# Copyright (c) 2012, Lambda Foundry, Inc. # See LICENSE for the license from collections import defaultdict from csv import ( QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC, ) import sys import time import warnings from pandas.errors import ParserError from pandas.util._exceptions import find_stack_level from pandas import StringDtype from pandas.core.arrays import ( ArrowExtensionArray, BooleanArray, FloatingArray, IntegerArray, ) cimport cython from cpython.bytes cimport PyBytes_AsString from cpython.exc cimport ( PyErr_Fetch, PyErr_Occurred, ) from cpython.object cimport PyObject from cpython.ref cimport ( Py_INCREF, Py_XDECREF, ) from cpython.unicode cimport ( PyUnicode_AsUTF8String, PyUnicode_Decode, PyUnicode_DecodeUTF8, ) from cython cimport Py_ssize_t from libc.stdlib cimport free from libc.string cimport ( strcasecmp, strlen, strncpy, ) cdef extern from "Python.h": # TODO(cython3): get this from cpython.unicode object PyUnicode_FromString(char *v) import numpy as np cimport numpy as cnp from numpy cimport ( float64_t, int64_t, ndarray, uint8_t, uint64_t, ) cnp.import_array() from pandas._libs cimport util from pandas._libs.util cimport ( INT64_MAX, INT64_MIN, UINT64_MAX, ) from pandas._libs import lib from pandas._libs.khash cimport ( kh_destroy_float64, kh_destroy_str, kh_destroy_str_starts, kh_destroy_strbox, kh_exist_str, kh_float64_t, kh_get_float64, kh_get_str, kh_get_str_starts_item, kh_get_strbox, kh_init_float64, kh_init_str, kh_init_str_starts, kh_init_strbox, kh_put_float64, kh_put_str, kh_put_str_starts_item, kh_put_strbox, kh_resize_float64, kh_resize_str_starts, kh_str_starts_t, kh_str_t, kh_strbox_t, khiter_t, ) from pandas.errors import ( EmptyDataError, ParserError, ParserWarning, ) from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_object_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.inference import is_dict_like cdef: float64_t INF = np.inf float64_t NEGINF = -INF int64_t DEFAULT_CHUNKSIZE = 256 * 1024 cdef extern from "headers/portable.h": # I *think* this is here so that strcasecmp is defined on Windows # so we don't get # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp` # in Appveyor. # In a sane world, the `from libc.string cimport` above would fail # loudly. pass cdef extern from "parser/tokenizer.h": ctypedef enum ParserState: START_RECORD START_FIELD ESCAPED_CHAR IN_FIELD IN_QUOTED_FIELD ESCAPE_IN_QUOTED_FIELD QUOTE_IN_QUOTED_FIELD EAT_CRNL EAT_CRNL_NOP EAT_WHITESPACE EAT_COMMENT EAT_LINE_COMMENT WHITESPACE_LINE SKIP_LINE FINISHED enum: ERROR_OVERFLOW ctypedef enum BadLineHandleMethod: ERROR, WARN, SKIP ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) ctypedef struct parser_t: void *source io_callback cb_io io_cleanup cb_cleanup int64_t chunksize # Number of bytes to prepare for each chunk char *data # pointer to data to be processed int64_t datalen # amount of data available int64_t datapos # where to write out tokenized data char *stream uint64_t stream_len uint64_t stream_cap # Store words in (potentially ragged) matrix for now, hmm char **words int64_t *word_starts # where we are in the stream uint64_t words_len uint64_t words_cap uint64_t max_words_cap # maximum word cap encountered char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field int64_t *line_start # position in words for start of line int64_t *line_fields # Number of fields in each line uint64_t lines # Number of lines observed uint64_t file_lines # Number of lines observed (with bad/skipped) uint64_t lines_cap # Vector capacity # Tokenizing stuff ParserState state int doublequote # is " represented by ""? */ char delimiter # field separator */ int delim_whitespace # consume tabs / spaces instead char quotechar # quote character */ char escapechar # escape character */ char lineterminator int skipinitialspace # ignore spaces following delimiter? */ int quoting # style of quoting to write */ char commentchar int allow_embedded_newline int usecols Py_ssize_t expected_fields BadLineHandleMethod on_bad_lines # floating point options char decimal char sci # thousands separator (comma, period) char thousands int header # Boolean: 1: has header, 0: no header int64_t header_start # header row start uint64_t header_end # header row end void *skipset PyObject *skipfunc int64_t skip_first_N_rows int64_t skipfooter # pick one, depending on whether the converter requires GIL float64_t (*double_converter)(const char *, char **, char, char, char, int, int *, int *) nogil # error handling char *warn_msg char *error_msg int64_t skip_empty_lines ctypedef struct coliter_t: char **words int64_t *line_start int64_t col ctypedef struct uint_state: int seen_sint int seen_uint int seen_null void uint_state_init(uint_state *self) int uint64_conflict(uint_state *self) void coliter_setup(coliter_t *it, parser_t *parser, int64_t i, int64_t start) nogil void COLITER_NEXT(coliter_t, const char *) nogil parser_t* parser_new() int parser_init(parser_t *self) nogil void parser_free(parser_t *self) nogil void parser_del(parser_t *self) nogil int parser_add_skiprow(parser_t *self, int64_t row) int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) void parser_set_default_options(parser_t *self) int parser_consume_rows(parser_t *self, size_t nrows) int parser_trim_buffers(parser_t *self) int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil int64_t str_to_int64(char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) nogil uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, uint64_t uint_max, int *error, char tsep) nogil float64_t xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) nogil float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) nogil float64_t round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) nogil int to_boolean(const char *item, uint8_t *val) nogil cdef extern from "parser/io.h": void *new_rd_source(object obj) except NULL int del_rd_source(void *src) void* buffer_rd_bytes(void *source, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) cdef class TextReader: """ # source: StringIO or file object ..versionchange:: 1.2.0 removed 'compression', 'memory_map', and 'encoding' argument. These arguments are outsourced to CParserWrapper. 'source' has to be a file handle. """ cdef: parser_t *parser object na_fvalues object true_values, false_values object handle object orig_header bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns bint allow_leading_cols uint64_t parser_start # this is modified after __init__ list clocks const char *encoding_errors kh_str_starts_t *false_set kh_str_starts_t *true_set int64_t buffer_lines, skipfooter list dtype_cast_order # list[np.dtype] list names # can be None set noconvert # set[int] cdef public: int64_t leading_cols, table_width object delimiter # bytes or str object converters object na_values list header # list[list[non-negative integers]] object index_col object skiprows object dtype object usecols set unnamed_cols # set[str] str dtype_backend def __cinit__(self, source, delimiter=b",", # bytes | str header=0, int64_t header_start=0, uint64_t header_end=0, index_col=None, names=None, tokenize_chunksize=DEFAULT_CHUNKSIZE, bint delim_whitespace=False, converters=None, bint skipinitialspace=False, escapechar=None, # bytes | str bint doublequote=True, quotechar=b'"', quoting=0, # int lineterminator=None, # bytes | str comment=None, decimal=b".", # bytes | str thousands=None, # bytes | str dtype=None, usecols=None, on_bad_lines=ERROR, bint na_filter=True, na_values=None, na_fvalues=None, bint keep_default_na=True, true_values=None, false_values=None, bint allow_leading_cols=True, skiprows=None, skipfooter=0, # int64_t bint verbose=False, float_precision=None, bint skip_blank_lines=True, encoding_errors=b"strict", dtype_backend="numpy"): # set encoding for native Python and C library if isinstance(encoding_errors, str): encoding_errors = encoding_errors.encode("utf-8") elif encoding_errors is None: encoding_errors = b"strict" Py_INCREF(encoding_errors) self.encoding_errors = PyBytes_AsString(encoding_errors) self.parser = parser_new() self.parser.chunksize = tokenize_chunksize # For timekeeping self.clocks = [] self.parser.usecols = (usecols is not None) self._setup_parser_source(source) parser_set_default_options(self.parser) parser_init(self.parser) if delim_whitespace: self.parser.delim_whitespace = delim_whitespace else: if len(delimiter) > 1: raise ValueError("only length-1 separators excluded right now") self.parser.delimiter = ord(delimiter) # ---------------------------------------- # parser options self.parser.doublequote = doublequote self.parser.skipinitialspace = skipinitialspace self.parser.skip_empty_lines = skip_blank_lines if lineterminator is not None: if len(lineterminator) != 1: raise ValueError("Only length-1 line terminators supported") self.parser.lineterminator = ord(lineterminator) if len(decimal) != 1: raise ValueError("Only length-1 decimal markers supported") self.parser.decimal = ord(decimal) if thousands is not None: if len(thousands) != 1: raise ValueError("Only length-1 thousands markers supported") self.parser.thousands = ord(thousands) if escapechar is not None: if len(escapechar) != 1: raise ValueError("Only length-1 escapes supported") self.parser.escapechar = ord(escapechar) self._set_quoting(quotechar, quoting) dtype_order = ["int64", "float64", "bool", "object"] if quoting == QUOTE_NONNUMERIC: # consistent with csv module semantics, cast all to float dtype_order = dtype_order[1:] self.dtype_cast_order = [np.dtype(x) for x in dtype_order] if comment is not None: if len(comment) > 1: raise ValueError("Only length-1 comment characters supported") self.parser.commentchar = ord(comment) self.parser.on_bad_lines = on_bad_lines self.skiprows = skiprows if skiprows is not None: self._make_skiprow_set() self.skipfooter = skipfooter if usecols is not None: self.has_usecols = 1 # GH-20558, validate usecols at higher level and only pass clean # usecols into TextReader. self.usecols = usecols if skipfooter > 0: self.parser.on_bad_lines = SKIP self.delimiter = delimiter self.na_values = na_values if na_fvalues is None: na_fvalues = set() self.na_fvalues = na_fvalues self.true_values = _maybe_encode(true_values) + _true_values self.false_values = _maybe_encode(false_values) + _false_values self.true_set = kset_from_list(self.true_values) self.false_set = kset_from_list(self.false_values) self.keep_default_na = keep_default_na self.converters = converters self.na_filter = na_filter self.verbose = verbose if float_precision == "round_trip": # see gh-15140 self.parser.double_converter = round_trip elif float_precision == "legacy": self.parser.double_converter = xstrtod elif float_precision == "high" or float_precision is None: self.parser.double_converter = precise_xstrtod else: raise ValueError(f"Unrecognized float_precision option: " f"{float_precision}") # Caller is responsible for ensuring we have one of # - None # - DtypeObj # - dict[Any, DtypeObj] self.dtype = dtype self.dtype_backend = dtype_backend self.noconvert = set() self.index_col = index_col # ---------------------------------------- # header stuff self.allow_leading_cols = allow_leading_cols self.leading_cols = 0 # updated in _get_header # TODO: no header vs. header is not the first row self.has_mi_columns = 0 self.orig_header = header if header is None: # sentinel value self.parser.header_start = -1 self.parser.header_end = -1 self.parser.header = -1 self.parser_start = 0 prelim_header = [] else: if isinstance(header, list): if len(header) > 1: # need to artificially skip the final line # which is still a header line header = list(header) header.append(header[-1] + 1) self.parser.header_end = header[-1] self.has_mi_columns = 1 else: self.parser.header_end = header[0] self.parser_start = header[-1] + 1 self.parser.header_start = header[0] self.parser.header = header[0] prelim_header = header else: self.parser.header_start = header self.parser.header_end = header self.parser_start = header + 1 self.parser.header = header prelim_header = [header] self.names = names header, table_width, unnamed_cols = self._get_header(prelim_header) # header, table_width, and unnamed_cols are set here, never changed self.header = header self.table_width = table_width self.unnamed_cols = unnamed_cols if not self.table_width: raise EmptyDataError("No columns to parse from file") # Compute buffer_lines as function of table width. heuristic = 2**20 // self.table_width self.buffer_lines = 1 while self.buffer_lines * 2 < heuristic: self.buffer_lines *= 2 def __init__(self, *args, **kwargs): pass def __dealloc__(self): _close(self) parser_del(self.parser) def close(self): _close(self) def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: raise TypeError('bad "quoting" value') if not isinstance(quote_char, (str, bytes)) and quote_char is not None: dtype = type(quote_char).__name__ raise TypeError(f'"quotechar" must be string, not {dtype}') if quote_char is None or quote_char == "": if quoting != QUOTE_NONE: raise TypeError("quotechar must be set if quoting enabled") self.parser.quoting = quoting self.parser.quotechar = -1 elif len(quote_char) > 1: # 0-len case handled earlier raise TypeError('"quotechar" must be a 1-character string') else: self.parser.quoting = quoting self.parser.quotechar = ord(quote_char) cdef _make_skiprow_set(self): if util.is_integer_object(self.skiprows): parser_set_skipfirstnrows(self.parser, self.skiprows) elif not callable(self.skiprows): for i in self.skiprows: parser_add_skiprow(self.parser, i) else: self.parser.skipfunc = self.skiprows cdef _setup_parser_source(self, source): cdef: void *ptr ptr = new_rd_source(source) self.parser.source = ptr self.parser.cb_io = &buffer_rd_bytes self.parser.cb_cleanup = &del_rd_source cdef _get_header(self, list prelim_header): # header is now a list of lists, so field_count should use header[0] # # modifies: # self.parser attributes # self.parser_start # self.leading_cols cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count, level char *word str name uint64_t hr, data_line = 0 list header = [] set unnamed_cols = set() if self.parser.header_start >= 0: # Header is in the file for level, hr in enumerate(prelim_header): this_header = [] if self.parser.lines < hr + 1: self._tokenize_rows(hr + 2) if self.parser.lines == 0: field_count = 0 start = self.parser.line_start[0] # e.g., if header=3 and file only has 2 lines elif (self.parser.lines < hr + 1 and not isinstance(self.orig_header, list)) or ( self.parser.lines < hr): msg = self.orig_header if isinstance(msg, list): joined = ",".join(str(m) for m in msg) msg = f"[{joined}], len of {len(msg)}," raise ParserError( f"Passed header={msg} but only " f"{self.parser.lines} lines in file") else: field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] unnamed_count = 0 unnamed_col_indices = [] for i in range(field_count): word = self.parser.words[start + i] name = PyUnicode_DecodeUTF8(word, strlen(word), self.encoding_errors) if name == "": if self.has_mi_columns: name = f"Unnamed: {i}_level_{level}" else: name = f"Unnamed: {i}" unnamed_count += 1 unnamed_col_indices.append(i) this_header.append(name) if not self.has_mi_columns: # Ensure that regular columns are used before unnamed ones # to keep given names and mangle unnamed columns col_loop_order = [i for i in range(len(this_header)) if i not in unnamed_col_indices ] + unnamed_col_indices counts = {} for i in col_loop_order: col = this_header[i] old_col = col cur_count = counts.get(col, 0) if cur_count > 0: while cur_count > 0: counts[old_col] = cur_count + 1 col = f"{old_col}.{cur_count}" if col in this_header: cur_count += 1 else: cur_count = counts.get(col, 0) if ( self.dtype is not None and is_dict_like(self.dtype) and self.dtype.get(old_col) is not None and self.dtype.get(col) is None ): self.dtype.update({col: self.dtype.get(old_col)}) this_header[i] = col counts[col] = cur_count + 1 if self.has_mi_columns: # If we have grabbed an extra line, but it's not in our # format, save in the buffer, and create an blank extra # line for the rest of the parsing code. if hr == prelim_header[-1]: lc = len(this_header) ic = (len(self.index_col) if self.index_col is not None else 0) # if wrong number of blanks or no index, not our format if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0: hr -= 1 self.parser_start -= 1 this_header = [None] * lc data_line = hr + 1 header.append(this_header) unnamed_cols.update({this_header[i] for i in unnamed_col_indices}) if self.names is not None: header = [self.names] elif self.names is not None: # Names passed if self.parser.lines < 1: if not self.has_usecols: self.parser.expected_fields = len(self.names) self._tokenize_rows(1) header = [self.names] if self.parser.lines < 1: field_count = len(header[0]) else: field_count = self.parser.line_fields[data_line] # Enforce this unless usecols if not self.has_usecols: self.parser.expected_fields = max(field_count, len(self.names)) else: # No header passed nor to be found in the file if self.parser.lines < 1: self._tokenize_rows(1) return None, self.parser.line_fields[0], unnamed_cols # Corner case, not enough lines in the file if self.parser.lines < data_line + 1: field_count = len(header[0]) else: field_count = self.parser.line_fields[data_line] # #2981 if self.names is not None: field_count = max(field_count, len(self.names)) passed_count = len(header[0]) if (self.has_usecols and self.allow_leading_cols and not callable(self.usecols)): nuse = len(self.usecols) if nuse == passed_count: self.leading_cols = 0 elif self.names is None and nuse < passed_count: self.leading_cols = field_count - passed_count elif passed_count != field_count: raise ValueError("Number of passed names did not match number of " "header fields in the file") # oh boy, #2442, #2981 elif self.allow_leading_cols and passed_count < field_count: self.leading_cols = field_count - passed_count return header, field_count, unnamed_cols def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]: """ rows=None --> read all rows """ # Don't care about memory usage columns = self._read_rows(rows, 1) return columns def read_low_memory(self, rows: int | None)-> list[dict[int, "ArrayLike"]]: """ rows=None --> read all rows """ # Conserve intermediate space # Caller is responsible for concatenating chunks, # see c_parser_wrapper._concatenate_chunks cdef: size_t rows_read = 0 list chunks = [] if rows is None: while True: try: chunk = self._read_rows(self.buffer_lines, 0) if len(chunk) == 0: break except StopIteration: break else: chunks.append(chunk) else: while rows_read < rows: try: crows = min(self.buffer_lines, rows - rows_read) chunk = self._read_rows(crows, 0) if len(chunk) == 0: break rows_read += len(list(chunk.values())[0]) except StopIteration: break else: chunks.append(chunk) parser_trim_buffers(self.parser) if len(chunks) == 0: raise StopIteration return chunks cdef _tokenize_rows(self, size_t nrows): cdef: int status with nogil: status = tokenize_nrows(self.parser, nrows, self.encoding_errors) self._check_tokenize_status(status) cdef _check_tokenize_status(self, int status): if self.parser.warn_msg != NULL: print(PyUnicode_DecodeUTF8( self.parser.warn_msg, strlen(self.parser.warn_msg), self.encoding_errors), file=sys.stderr) free(self.parser.warn_msg) self.parser.warn_msg = NULL if status < 0: raise_parser_error("Error tokenizing data", self.parser) # -> dict[int, "ArrayLike"] cdef _read_rows(self, rows, bint trim): cdef: int64_t buffered_lines int64_t irows self._start_clock() if rows is not None: irows = rows buffered_lines = self.parser.lines - self.parser_start if buffered_lines < irows: self._tokenize_rows(irows - buffered_lines) if self.skipfooter > 0: raise ValueError("skipfooter can only be used to read " "the whole file") else: with nogil: status = tokenize_all_rows(self.parser, self.encoding_errors) self._check_tokenize_status(status) if self.parser_start >= self.parser.lines: raise StopIteration self._end_clock("Tokenization") self._start_clock() columns = self._convert_column_data(rows) self._end_clock("Type conversion") self._start_clock() if len(columns) > 0: rows_read = len(list(columns.values())[0]) # trim parser_consume_rows(self.parser, rows_read) if trim: parser_trim_buffers(self.parser) self.parser_start -= rows_read self._end_clock("Parser memory cleanup") return columns cdef _start_clock(self): self.clocks.append(time.time()) cdef _end_clock(self, str what): if self.verbose: elapsed = time.time() - self.clocks.pop(-1) print(f"{what} took: {elapsed * 1000:.2f} ms") def set_noconvert(self, i: int) -> None: self.noconvert.add(i) def remove_noconvert(self, i: int) -> None: self.noconvert.remove(i) def _convert_column_data(self, rows: int | None) -> dict[int, "ArrayLike"]: cdef: int64_t i int nused kh_str_starts_t *na_hashset = NULL int64_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 int64_t num_cols dict results start = self.parser_start if rows is None: end = self.parser.lines else: end = min(start + rows, self.parser.lines) num_cols = -1 # Py_ssize_t cast prevents build warning for i in range(self.parser.lines): num_cols = (num_cols < self.parser.line_fields[i]) * \ self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols names_larger_num_cols = (self.names and len(self.names) - self.leading_cols > num_cols) if self.table_width - self.leading_cols > num_cols: if (usecols_not_callable_and_exists and self.table_width - self.leading_cols < len(self.usecols) or names_larger_num_cols): raise ParserError(f"Too many columns specified: expected " f"{self.table_width - self.leading_cols} " f"and found {num_cols}") if (usecols_not_callable_and_exists and all(isinstance(u, int) for u in self.usecols)): missing_usecols = [col for col in self.usecols if col >= num_cols] if missing_usecols: raise ParserError( "Defining usecols without of bounds indices is not allowed. " f"{missing_usecols} are out of bounds.", ) results = {} nused = 0 is_default_dict_dtype = isinstance(self.dtype, defaultdict) for i in range(self.table_width): if i < self.leading_cols: # Pass through leading columns always name = i elif (self.usecols and not callable(self.usecols) and nused == len(self.usecols)): # Once we've gathered all requested columns, stop. GH5766 break else: name = self._get_column_name(i, nused) usecols = set() if callable(self.usecols): if self.usecols(name): usecols = {i} else: usecols = self.usecols if self.has_usecols and not (i in usecols or name in usecols): continue nused += 1 conv = self._get_converter(i, name) col_dtype = None if self.dtype is not None: if isinstance(self.dtype, dict): if name in self.dtype: col_dtype = self.dtype[name] elif i in self.dtype: col_dtype = self.dtype[i] elif is_default_dict_dtype: col_dtype = self.dtype[name] else: if self.dtype.names: # structured array col_dtype = np.dtype(self.dtype.descr[i][1]) else: col_dtype = self.dtype if conv: if col_dtype is not None: warnings.warn((f"Both a converter and dtype were specified " f"for column {name} - only the converter will " f"be used."), ParserWarning, stacklevel=find_stack_level()) results[i] = _apply_converter(conv, self.parser, i, start, end) continue # Collect the list of NaN values associated with the column. # If we aren't supposed to do that, or none are collected, # we set `na_filter` to `0` (`1` otherwise). na_flist = set() if self.na_filter: na_list, na_flist = self._get_na_list(i, name) if na_list is None: na_filter = 0 else: na_filter = 1 na_hashset = kset_from_list(na_list) else: na_filter = 0 # Attempt to parse tokens and infer dtype of the column. # Should return as the desired dtype (inferred or specified). try: col_res, na_count = self._convert_tokens( i, start, end, name, na_filter, na_hashset, na_flist, col_dtype) finally: # gh-21353 # # Cleanup the NaN hash that we generated # to avoid memory leaks. if na_filter: self._free_na_set(na_hashset) # don't try to upcast EAs if ( na_count > 0 and not is_extension_array_dtype(col_dtype) or self.dtype_backend != "numpy" ): use_dtype_backend = self.dtype_backend != "numpy" and col_dtype is None col_res = _maybe_upcast( col_res, use_dtype_backend=use_dtype_backend, dtype_backend=self.dtype_backend, ) if col_res is None: raise ParserError(f"Unable to parse column {i}") results[i] = col_res self.parser_start += end - start return results # -> tuple["ArrayLike", int]: cdef _convert_tokens(self, Py_ssize_t i, int64_t start, int64_t end, object name, bint na_filter, kh_str_starts_t *na_hashset, object na_flist, object col_dtype): if col_dtype is not None: col_res, na_count = self._convert_with_dtype( col_dtype, i, start, end, na_filter, 1, na_hashset, na_flist) # Fallback on the parse (e.g. we requested int dtype, # but its actually a float). if col_res is not None: return col_res, na_count if i in self.noconvert: return self._string_convert(i, start, end, na_filter, na_hashset) else: col_res = None for dt in self.dtype_cast_order: try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_flist) except ValueError: # This error is raised from trying to convert to uint64, # and we discover that we cannot convert to any numerical # dtype successfully. As a result, we leave the data # column AS IS with object dtype. col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, 0, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( np.dtype("object"), i, start, end, na_filter, 0, na_hashset, na_flist) if col_res is not None: break # we had a fallback parse on the dtype, so now try to cast if col_res is not None and col_dtype is not None: # If col_res is bool, it might actually be a bool array mixed with NaNs # (see _try_bool_flex()). Usually this would be taken care of using # _maybe_upcast(), but if col_dtype is a floating type we should just # take care of that cast here. if col_res.dtype == np.bool_ and is_float_dtype(col_dtype): mask = col_res.view(np.uint8) == na_values[np.uint8] col_res = col_res.astype(col_dtype) np.putmask(col_res, mask, np.nan) return col_res, na_count # NaNs are already cast to True here, so can not use astype if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype): if na_count > 0: raise ValueError( f"cannot safely convert passed user dtype of " f"{col_dtype} for {np.bool_} dtyped data in " f"column {i} due to NA values" ) # only allow safe casts, eg. with a nan you cannot safely cast to int try: col_res = col_res.astype(col_dtype, casting="safe") except TypeError: # float -> int conversions can fail the above # even with no nans col_res_orig = col_res col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): raise ValueError( f"cannot safely convert passed user dtype of " f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in " f"column {i}") return col_res, na_count cdef _convert_with_dtype(self, object dtype, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, bint user_dtype, kh_str_starts_t *na_hashset, object na_flist): if isinstance(dtype, CategoricalDtype): # TODO: I suspect that _categorical_convert could be # optimized when dtype is an instance of CategoricalDtype codes, cats, na_count = _categorical_convert( self.parser, i, start, end, na_filter, na_hashset) # Method accepts list of strings, not encoded ones. true_values = [x.decode() for x in self.true_values] array_type = dtype.construct_array_type() cat = array_type._from_inferred_categories( cats, codes, dtype, true_values=true_values) return cat, na_count elif is_extension_array_dtype(dtype): result, na_count = self._string_convert(i, start, end, na_filter, na_hashset) array_type = dtype.construct_array_type() try: # use _from_sequence_of_strings if the class defines it if is_bool_dtype(dtype): true_values = [x.decode() for x in self.true_values] false_values = [x.decode() for x in self.false_values] result = array_type._from_sequence_of_strings( result, dtype=dtype, true_values=true_values, false_values=false_values) else: result = array_type._from_sequence_of_strings(result, dtype=dtype) except NotImplementedError: raise NotImplementedError( f"Extension Array: {array_type} must implement " f"_from_sequence_of_strings in order " f"to be used in parser methods") return result, na_count elif is_integer_dtype(dtype): try: result, na_count = _try_int64(self.parser, i, start, end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError(f"Integer column has NA values in column {i}") except OverflowError: result = _try_uint64(self.parser, i, start, end, na_filter, na_hashset) na_count = 0 if result is not None and dtype != "int64": result = result.astype(dtype) return result, na_count elif is_float_dtype(dtype): result, na_count = _try_double(self.parser, i, start, end, na_filter, na_hashset, na_flist) if result is not None and dtype != "float64": result = result.astype(dtype) return result, na_count elif is_bool_dtype(dtype): result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) if user_dtype and na_count is not None: if na_count > 0: raise ValueError(f"Bool column has NA values in column {i}") return result, na_count elif dtype.kind == "S": # TODO: na handling width = dtype.itemsize if width > 0: result = _to_fw_string(self.parser, i, start, end, width) return result, 0 # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, na_hashset) elif dtype.kind == "U": width = dtype.itemsize if width > 0: raise TypeError(f"the dtype {dtype} is not supported for parsing") # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) elif is_datetime64_dtype(dtype): raise TypeError(f"the dtype {dtype} is not supported " f"for parsing, pass this column " f"using parse_dates instead") else: raise TypeError(f"the dtype {dtype} is not supported for parsing") # -> tuple[ndarray[object], int] cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, bint na_filter, kh_str_starts_t *na_hashset): return _string_box_utf8(self.parser, i, start, end, na_filter, na_hashset, self.encoding_errors) def _get_converter(self, i: int, name): if self.converters is None: return None if name is not None and name in self.converters: return self.converters[name] # Converter for position, if any return self.converters.get(i) cdef _get_na_list(self, Py_ssize_t i, name): # Note: updates self.na_values, self.na_fvalues if self.na_values is None: return None, set() if isinstance(self.na_values, dict): key = None values = None if name is not None and name in self.na_values: key = name elif i in self.na_values: key = i else: # No na_values provided for this column. if self.keep_default_na: return _NA_VALUES, set() return list(), set() values = self.na_values[key] if values is not None and not isinstance(values, list): values = list(values) fvalues = self.na_fvalues[key] if fvalues is not None and not isinstance(fvalues, set): fvalues = set(fvalues) return _ensure_encoded(values), fvalues else: if not isinstance(self.na_values, list): self.na_values = list(self.na_values) if not isinstance(self.na_fvalues, set): self.na_fvalues = set(self.na_fvalues) return _ensure_encoded(self.na_values), self.na_fvalues cdef _free_na_set(self, kh_str_starts_t *table): kh_destroy_str_starts(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): cdef int64_t j if self.has_usecols and self.names is not None: if (not callable(self.usecols) and len(self.names) == len(self.usecols)): return self.names[nused] else: return self.names[i - self.leading_cols] else: if self.header is not None: j = i - self.leading_cols # generate extra (bogus) headers if there are more columns than headers # These should be strings, not integers, because otherwise we might get # issues with callables as usecols GH#46997 if j >= len(self.header[0]): return str(j) elif self.has_mi_columns: return tuple(header_row[j] for header_row in self.header) else: return self.header[0][j] else: return None # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ # which causes a class attribute lookup and violates best practices # https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc cdef _close(TextReader reader): # also preemptively free all allocated memory parser_free(reader.parser) if reader.true_set: kh_destroy_str_starts(reader.true_set) reader.true_set = NULL if reader.false_set: kh_destroy_str_starts(reader.false_set) reader.false_set = NULL cdef: object _true_values = [b"True", b"TRUE", b"true"] object _false_values = [b"False", b"FALSE", b"false"] def _ensure_encoded(list lst): cdef: list result = [] for x in lst: if isinstance(x, str): x = PyUnicode_AsUTF8String(x) elif not isinstance(x, bytes): x = str(x).encode("utf-8") result.append(x) return result # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', STR_NA_VALUES = { "-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A N/A", "#N/A", "N/A", "n/a", "NA", "", "#NA", "NULL", "null", "NaN", "-NaN", "nan", "-nan", "", "None", } _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) def _maybe_upcast( arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy" ): """Sets nullable dtypes or upcasts if nans are present. Upcast, if use_dtype_backend is false and nans are present so that the current dtype can not hold the na value. We use nullable dtypes if the flag is true for every array. Parameters ---------- arr: ndarray Numpy array that is potentially being upcast. use_dtype_backend: bool, default False If true, we cast to the associated nullable dtypes. Returns ------- The casted array. """ if is_extension_array_dtype(arr.dtype): # TODO: the docstring says arr is an ndarray, in which case this cannot # be reached. Is that incorrect? return arr na_value = na_values[arr.dtype] if issubclass(arr.dtype.type, np.integer): mask = arr == na_value if use_dtype_backend: arr = IntegerArray(arr, mask) else: arr = arr.astype(float) np.putmask(arr, mask, np.nan) elif arr.dtype == np.bool_: mask = arr.view(np.uint8) == na_value if use_dtype_backend: arr = BooleanArray(arr, mask) else: arr = arr.astype(object) np.putmask(arr, mask, np.nan) elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32: if use_dtype_backend: mask = np.isnan(arr) arr = FloatingArray(arr, mask) elif arr.dtype == np.object_: if use_dtype_backend: arr = StringDtype().construct_array_type()._from_sequence(arr) if use_dtype_backend and dtype_backend == "pyarrow": import pyarrow as pa if isinstance(arr, IntegerArray) and arr.isna().all(): # use null instead of int64 in pyarrow arr = arr.to_numpy() arr = ArrowExtensionArray(pa.array(arr, from_pandas=True)) return arr # ---------------------------------------------------------------------- # Type conversions / inference support code # -> tuple[ndarray[object], int] cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, const char *encoding_errors): cdef: int na_count = 0 Py_ssize_t i, lines coliter_t it const char *word = NULL ndarray[object] result int ret = 0 kh_strbox_t *table object pyval object NA = na_values[np.object_] khiter_t k table = kh_init_strbox() lines = line_end - line_start result = np.empty(lines, dtype=np.object_) coliter_setup(&it, parser, col, line_start) for i in range(lines): COLITER_NEXT(it, word) if na_filter: if kh_get_str_starts_item(na_hashset, word): # in the hash table na_count += 1 result[i] = NA continue k = kh_get_strbox(table, word) # in the hash table if k != table.n_buckets: # this increments the refcount, but need to test pyval = table.vals[k] else: # box it. new ref? pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors) k = kh_put_strbox(table, word, &ret) table.vals[k] = pyval result[i] = pyval kh_destroy_strbox(table) return result, na_count @cython.boundscheck(False) cdef _categorical_convert(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset): "Convert column data into codes, categories" cdef: int na_count = 0 Py_ssize_t i, lines coliter_t it const char *word = NULL int64_t NA = -1 int64_t[::1] codes int64_t current_category = 0 int ret = 0 kh_str_t *table khiter_t k lines = line_end - line_start codes = np.empty(lines, dtype=np.int64) # factorize parsed values, creating a hash table # bytes -> category code with nogil: table = kh_init_str() coliter_setup(&it, parser, col, line_start) for i in range(lines): COLITER_NEXT(it, word) if na_filter: if kh_get_str_starts_item(na_hashset, word): # is in NA values na_count += 1 codes[i] = NA continue k = kh_get_str(table, word) # not in the hash table if k == table.n_buckets: k = kh_put_str(table, word, &ret) table.vals[k] = current_category current_category += 1 codes[i] = table.vals[k] # parse and box categories to python strings result = np.empty(table.n_occupied, dtype=np.object_) for k in range(table.n_buckets): if kh_exist_str(table, k): result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) kh_destroy_str(table) return np.asarray(codes), result, na_count # -> ndarray[f'|S{width}'] cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, int64_t width): cdef: char *data ndarray result result = np.empty(line_end - line_start, dtype=f"|S{width}") data = result.data with nogil: _to_fw_string_nogil(parser, col, line_start, line_end, width, data) return result cdef void _to_fw_string_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, size_t width, char *data) nogil: cdef: int64_t i coliter_t it const char *word = NULL coliter_setup(&it, parser, col, line_start) for i in range(line_end - line_start): COLITER_NEXT(it, word) strncpy(data, word, width) data += width cdef: char* cinf = b"inf" char* cposinf = b"+inf" char* cneginf = b"-inf" char* cinfty = b"Infinity" char* cposinfty = b"+Infinity" char* cneginfty = b"-Infinity" # -> tuple[ndarray[float64_t], int] | tuple[None, None] cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, object na_flist): cdef: int error, na_count = 0 Py_ssize_t lines float64_t *data float64_t NA = na_values[np.float64] kh_float64_t *na_fset ndarray[float64_t] result bint use_na_flist = len(na_flist) > 0 lines = line_end - line_start result = np.empty(lines, dtype=np.float64) data = result.data na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, parser.double_converter, col, line_start, line_end, na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count cdef int _try_double_nogil(parser_t *parser, float64_t (*double_converter)( const char *, char **, char, char, char, int, int *, int *) nogil, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset, bint use_na_flist, const kh_float64_t *na_flist, float64_t NA, float64_t *data, int *na_count) nogil: cdef: int error = 0, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL char *p_end khiter_t k64 na_count[0] = 0 coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): COLITER_NEXT(it, word) if kh_get_str_starts_item(na_hashset, word): # in the hash table na_count[0] += 1 data[0] = NA else: data[0] = double_converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1, &error, NULL) if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0 or strcasecmp(word, cinfty) == 0 or strcasecmp(word, cposinfty) == 0): data[0] = INF elif (strcasecmp(word, cneginf) == 0 or strcasecmp(word, cneginfty) == 0): data[0] = NEGINF else: return 1 if use_na_flist: k64 = kh_get_float64(na_flist, data[0]) if k64 != na_flist.n_buckets: na_count[0] += 1 data[0] = NA data += 1 else: for i in range(lines): COLITER_NEXT(it, word) data[0] = double_converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1, &error, NULL) if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0 or strcasecmp(word, cinfty) == 0 or strcasecmp(word, cposinfty) == 0): data[0] = INF elif (strcasecmp(word, cneginf) == 0 or strcasecmp(word, cneginfty) == 0): data[0] = NEGINF else: return 1 data += 1 return 0 cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset): cdef: int error Py_ssize_t lines coliter_t it uint64_t *data ndarray result uint_state state lines = line_end - line_start result = np.empty(lines, dtype=np.uint64) data = result.data uint_state_init(&state) coliter_setup(&it, parser, col, line_start) with nogil: error = _try_uint64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, data, &state) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") return None if uint64_conflict(&state): raise ValueError("Cannot convert to numerical dtype") if state.seen_sint: raise OverflowError("Overflow") return result cdef int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_starts_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: int error Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): COLITER_NEXT(it, word) if kh_get_str_starts_item(na_hashset, word): # in the hash table state.seen_null = 1 data[i] = 0 continue data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, &error, parser.thousands) if error != 0: return error else: for i in range(lines): COLITER_NEXT(it, word) data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, &error, parser.thousands) if error != 0: return error return 0 cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, kh_str_starts_t *na_hashset): cdef: int error, na_count = 0 Py_ssize_t lines coliter_t it int64_t *data ndarray result int64_t NA = na_values[np.int64] lines = line_end - line_start result = np.empty(lines, dtype=np.int64) data = result.data coliter_setup(&it, parser, col, line_start) with nogil: error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable raise OverflowError("Overflow") return None, None return result, na_count cdef int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_starts_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: int error Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL na_count[0] = 0 coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): COLITER_NEXT(it, word) if kh_get_str_starts_item(na_hashset, word): # in the hash table na_count[0] += 1 data[i] = NA continue data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: return error else: for i in range(lines): COLITER_NEXT(it, word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: return error return 0 # -> tuple[ndarray[bool], int] cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_starts_t *na_hashset, const kh_str_starts_t *true_hashset, const kh_str_starts_t *false_hashset): cdef: int error, na_count = 0 Py_ssize_t lines uint8_t *data ndarray result uint8_t NA = na_values[np.bool_] lines = line_end - line_start result = np.empty(lines, dtype=np.uint8) data = result.data with nogil: error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, true_hashset, false_hashset, NA, data, &na_count) if error != 0: return None, None return result.view(np.bool_), na_count cdef int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, const kh_str_starts_t *na_hashset, const kh_str_starts_t *true_hashset, const kh_str_starts_t *false_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: int error = 0 Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL na_count[0] = 0 coliter_setup(&it, parser, col, line_start) if na_filter: for i in range(lines): COLITER_NEXT(it, word) if kh_get_str_starts_item(na_hashset, word): # in the hash table na_count[0] += 1 data[0] = NA data += 1 continue if kh_get_str_starts_item(true_hashset, word): data[0] = 1 data += 1 continue if kh_get_str_starts_item(false_hashset, word): data[0] = 0 data += 1 continue error = to_boolean(word, data) if error != 0: return error data += 1 else: for i in range(lines): COLITER_NEXT(it, word) if kh_get_str_starts_item(true_hashset, word): data[0] = 1 data += 1 continue if kh_get_str_starts_item(false_hashset, word): data[0] = 0 data += 1 continue error = to_boolean(word, data) if error != 0: return error data += 1 return 0 cdef kh_str_starts_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: Py_ssize_t i kh_str_starts_t *table int ret = 0 object val table = kh_init_str_starts() for i in range(len(values)): val = values[i] # None creeps in sometimes, which isn't possible here if not isinstance(val, bytes): kh_destroy_str_starts(table) raise ValueError("Must be all encoded bytes") kh_put_str_starts_item(table, PyBytes_AsString(val), &ret) if table.table.n_buckets <= 128: # Resize the hash table to make it almost empty, this # reduces amount of hash collisions on lookup thus # "key not in table" case is faster. # Note that this trades table memory footprint for lookup speed. kh_resize_str_starts(table, table.table.n_buckets * 8) return table cdef kh_float64_t* kset_float64_from_list(values) except NULL: # caller takes responsibility for freeing the hash table cdef: kh_float64_t *table int ret = 0 float64_t val object value table = kh_init_float64() for value in values: val = float(value) kh_put_float64(table, val, &ret) if table.n_buckets <= 128: # See reasoning in kset_from_list kh_resize_float64(table, table.n_buckets * 8) return table cdef raise_parser_error(object base, parser_t *parser): cdef: object old_exc object exc_type PyObject *type PyObject *value PyObject *traceback if PyErr_Occurred(): PyErr_Fetch(&type, &value, &traceback) Py_XDECREF(traceback) if value != NULL: old_exc = value Py_XDECREF(value) # PyErr_Fetch only returned the error message in *value, # so the Exception class must be extracted from *type. if isinstance(old_exc, str): if type != NULL: exc_type = type else: exc_type = ParserError Py_XDECREF(type) raise exc_type(old_exc) else: Py_XDECREF(type) raise old_exc message = f"{base}. C error: " if parser.error_msg != NULL: message += parser.error_msg.decode("utf-8") else: message += "no error message set" raise ParserError(message) # ---------------------------------------------------------------------- # NA values def _compute_na_values(): int64info = np.iinfo(np.int64) int32info = np.iinfo(np.int32) int16info = np.iinfo(np.int16) int8info = np.iinfo(np.int8) uint64info = np.iinfo(np.uint64) uint32info = np.iinfo(np.uint32) uint16info = np.iinfo(np.uint16) uint8info = np.iinfo(np.uint8) na_values = { np.float32: np.nan, np.float64: np.nan, np.int64: int64info.min, np.int32: int32info.min, np.int16: int16info.min, np.int8: int8info.min, np.uint64: uint64info.max, np.uint32: uint32info.max, np.uint16: uint16info.max, np.uint8: uint8info.max, np.bool_: uint8info.max, np.object_: np.nan, } return na_values na_values = _compute_na_values() for k in list(na_values): na_values[np.dtype(k)] = na_values[k] # -> ArrayLike cdef _apply_converter(object f, parser_t *parser, int64_t col, int64_t line_start, int64_t line_end): cdef: Py_ssize_t i, lines coliter_t it const char *word = NULL ndarray[object] result object val lines = line_end - line_start result = np.empty(lines, dtype=np.object_) coliter_setup(&it, parser, col, line_start) for i in range(lines): COLITER_NEXT(it, word) val = PyUnicode_FromString(word) result[i] = f(val) return lib.maybe_convert_objects(result) cdef list _maybe_encode(list values): if values is None: return [] return [x.encode("utf-8") if isinstance(x, str) else x for x in values] def sanitize_objects(ndarray[object] values, set na_values) -> int: """ Convert specified values, including the given set na_values to np.nan. Parameters ---------- values : ndarray[object] na_values : set Returns ------- na_count : int """ cdef: Py_ssize_t i, n object val, onan Py_ssize_t na_count = 0 dict memo = {} n = len(values) onan = np.nan for i in range(n): val = values[i] if val in na_values: values[i] = onan na_count += 1 elif val in memo: values[i] = memo[val] else: memo[val] = val return na_count