# cython: language_level=3, initializedcheck=False
# cython: warn.maybe_uninitialized=True, warn.unused=True
from cython cimport Py_ssize_t
from libc.stddef cimport size_t
from libc.stdint cimport (
    int64_t,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t,
)
from libc.stdlib cimport (
    calloc,
    free,
)

import numpy as np

import pandas.io.sas.sas_constants as const

cdef object np_nan = np.nan


cdef struct Buffer:
    # Convenience wrapper for uint8_t data to allow fast and safe reads and writes.
    # We use this as a replacement for np.array(..., dtype=np.uint8) because it's
    # much slower to create NumPy arrays and we create Buffer instances many times
    # when reading a SAS7BDAT file (roughly once per row that is being read).
    uint8_t *data
    size_t length


cdef uint8_t buf_get(Buffer buf, size_t offset) except? 255:
    assert offset < buf.length, "Out of bounds read"
    return buf.data[offset]


cdef bint buf_set(Buffer buf, size_t offset, uint8_t value) except 0:
    assert offset < buf.length, "Out of bounds write"
    buf.data[offset] = value
    return True


cdef bytes buf_as_bytes(Buffer buf, size_t offset, size_t length):
    assert offset + length <= buf.length, "Out of bounds read"
    return buf.data[offset:offset + length]


cdef Buffer buf_new(size_t length) except *:
    cdef uint8_t *data = <uint8_t *>calloc(length, sizeof(uint8_t))
    if data == NULL:
        raise MemoryError(f"Failed to allocate {length} bytes")
    return Buffer(data, length)


cdef buf_free(Buffer buf):
    if buf.data != NULL:
        free(buf.data)


# rle_decompress decompresses data using a Run Length Encoding
# algorithm. It is partially documented here:
#
# https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
cdef int rle_decompress(Buffer inbuff, Buffer outbuff) except? 0:
    cdef:
        uint8_t control_byte, x
        int rpos = 0
        int i, nbytes, end_of_first_byte
        size_t ipos = 0
        Py_ssize_t _

    while ipos < inbuff.length:
        control_byte = buf_get(inbuff, ipos) & 0xF0
        end_of_first_byte = <int>(buf_get(inbuff, ipos) & 0x0F)
        ipos += 1

        if control_byte == 0x00:
            nbytes = <int>(buf_get(inbuff, ipos)) + 64 + end_of_first_byte * 256
            ipos += 1
            for _ in range(nbytes):
                buf_set(outbuff, rpos, buf_get(inbuff, ipos))
                rpos += 1
                ipos += 1
        elif control_byte == 0x40:
            # not documented
            nbytes = <int>(buf_get(inbuff, ipos)) + 18 + end_of_first_byte * 256
            ipos += 1
            for _ in range(nbytes):
                buf_set(outbuff, rpos, buf_get(inbuff, ipos))
                rpos += 1
            ipos += 1
        elif control_byte == 0x60:
            nbytes = end_of_first_byte * 256 + <int>(buf_get(inbuff, ipos)) + 17
            ipos += 1
            for _ in range(nbytes):
                buf_set(outbuff, rpos, 0x20)
                rpos += 1
        elif control_byte == 0x70:
            nbytes = end_of_first_byte * 256 + <int>(buf_get(inbuff, ipos)) + 17
            ipos += 1
            for _ in range(nbytes):
                buf_set(outbuff, rpos, 0x00)
                rpos += 1
        elif control_byte == 0x80:
            nbytes = end_of_first_byte + 1
            for i in range(nbytes):
                buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
                rpos += 1
            ipos += nbytes
        elif control_byte == 0x90:
            nbytes = end_of_first_byte + 17
            for i in range(nbytes):
                buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
                rpos += 1
            ipos += nbytes
        elif control_byte == 0xA0:
            nbytes = end_of_first_byte + 33
            for i in range(nbytes):
                buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
                rpos += 1
            ipos += nbytes
        elif control_byte == 0xB0:
            nbytes = end_of_first_byte + 49
            for i in range(nbytes):
                buf_set(outbuff, rpos, buf_get(inbuff, ipos + i))
                rpos += 1
            ipos += nbytes
        elif control_byte == 0xC0:
            nbytes = end_of_first_byte + 3
            x = buf_get(inbuff, ipos)
            ipos += 1
            for _ in range(nbytes):
                buf_set(outbuff, rpos, x)
                rpos += 1
        elif control_byte == 0xD0:
            nbytes = end_of_first_byte + 2
            for _ in range(nbytes):
                buf_set(outbuff, rpos, 0x40)
                rpos += 1
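

# A worked example of the RLE scheme above (illustrative, not taken from the
# SAS spec): the input b"\x83ABCD\xC2A\x61\x00" decodes as three commands:
#
#   0x83      -> control 0x80, count 3 + 1 = 4: copy the next 4 bytes ("ABCD")
#   0xC2 'A'  -> control 0xC0, count 2 + 3 = 5: emit the byte 'A' 5 times
#   0x61 0x00 -> control 0x60, count 1 * 256 + 0 + 17 = 273: emit 273 spaces
#
# so rle_decompress writes b"ABCD" + b"A" * 5 + b" " * 273 (282 bytes) into
# outbuff and returns 282.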
        elif control_byte == 0xE0:
            nbytes = end_of_first_byte + 2
            for _ in range(nbytes):
                buf_set(outbuff, rpos, 0x20)
                rpos += 1
        elif control_byte == 0xF0:
            nbytes = end_of_first_byte + 2
            for _ in range(nbytes):
                buf_set(outbuff, rpos, 0x00)
                rpos += 1
        else:
            raise ValueError(f"unknown control byte: {control_byte}")

    return rpos


# rdc_decompress decompresses data using the Ross Data Compression algorithm:
#
# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
cdef int rdc_decompress(Buffer inbuff, Buffer outbuff) except? 0:
    cdef:
        uint8_t cmd
        uint16_t ctrl_bits = 0, ctrl_mask = 0, ofs, cnt
        int rpos = 0, k
        size_t ipos = 0

    while ipos < inbuff.length:
        ctrl_mask = ctrl_mask >> 1
        if ctrl_mask == 0:
            # Reload the next 16 control bits (one bit per item, MSB first).
            ctrl_bits = ((<uint16_t>buf_get(inbuff, ipos) << 8) +
                         <uint16_t>buf_get(inbuff, ipos + 1))
            ipos += 2
            ctrl_mask = 0x8000

        if ctrl_bits & ctrl_mask == 0:
            # Control bit 0: copy one literal byte through unchanged.
            buf_set(outbuff, rpos, buf_get(inbuff, ipos))
            ipos += 1
            rpos += 1
            continue

        cmd = (buf_get(inbuff, ipos) >> 4) & 0x0F
        cnt = <uint16_t>(buf_get(inbuff, ipos) & 0x0F)
        ipos += 1

        # short RLE
        if cmd == 0:
            cnt += 3
            for k in range(cnt):
                buf_set(outbuff, rpos + k, buf_get(inbuff, ipos))
            rpos += cnt
            ipos += 1

        # long RLE
        elif cmd == 1:
            cnt += <uint16_t>buf_get(inbuff, ipos) << 4
            cnt += 19
            ipos += 1
            for k in range(cnt):
                buf_set(outbuff, rpos + k, buf_get(inbuff, ipos))
            rpos += cnt
            ipos += 1

        # long pattern
        elif cmd == 2:
            ofs = cnt + 3
            ofs += <uint16_t>buf_get(inbuff, ipos) << 4
            ipos += 1
            cnt = <uint16_t>buf_get(inbuff, ipos)
            ipos += 1
            cnt += 16
            for k in range(cnt):
                buf_set(outbuff, rpos + k, buf_get(outbuff, rpos - ofs + k))
            rpos += cnt

        # short pattern
        else:
            ofs = cnt + 3
            ofs += <uint16_t>buf_get(inbuff, ipos) << 4
            ipos += 1
            for k in range(cmd):
                buf_set(outbuff, rpos + k, buf_get(outbuff, rpos - ofs + k))
            rpos += cmd

    return rpos
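

# A worked example of the RDC scheme above (illustrative, not taken from the
# SAS spec): for the input b"\x80\x00\x05AB",
#
#   0x80 0x00 -> ctrl_bits = 0x8000: the first item is a command, the next
#                item is a literal
#   0x05 'A'  -> cmd 0 (short RLE), cnt 5 + 3 = 8: emit the byte 'A' 8 times
#   'B'       -> literal byte, copied through unchanged
#
# so rdc_decompress writes b"AAAAAAAAB" (9 bytes) into outbuff and returns 9.
# The pattern commands (cmd 2 and up) instead copy cnt bytes starting ofs
# bytes back in the already-written output, which is how repeated substrings
# are encoded.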

cdef enum ColumnTypes:
    column_type_decimal = 1
    column_type_string = 2


# Const aliases
assert len(const.page_meta_types) == 2
cdef:
    int page_meta_types_0 = const.page_meta_types[0]
    int page_meta_types_1 = const.page_meta_types[1]
    int page_mix_type = const.page_mix_type
    int page_data_type = const.page_data_type
    int subheader_pointers_offset = const.subheader_pointers_offset

    # Copy of subheader_signature_to_index that allows for much faster lookups.
    # Lookups are done in get_subheader_index. The C structures are initialized
    # in _init_subheader_signatures().
    uint32_t subheader_signatures_32bit[13]
    int subheader_indices_32bit[13]
    uint64_t subheader_signatures_64bit[17]
    int subheader_indices_64bit[17]
    int data_subheader_index = const.SASIndex.data_subheader_index


def _init_subheader_signatures():
    subheaders_32bit = [
        (sig, idx)
        for sig, idx in const.subheader_signature_to_index.items()
        if len(sig) == 4
    ]
    subheaders_64bit = [
        (sig, idx)
        for sig, idx in const.subheader_signature_to_index.items()
        if len(sig) == 8
    ]
    assert len(subheaders_32bit) == 13
    assert len(subheaders_64bit) == 17
    assert len(const.subheader_signature_to_index) == 13 + 17
    for i, (signature, idx) in enumerate(subheaders_32bit):
        subheader_signatures_32bit[i] = (<uint32_t *><char *>signature)[0]
        subheader_indices_32bit[i] = idx
    for i, (signature, idx) in enumerate(subheaders_64bit):
        subheader_signatures_64bit[i] = (<uint64_t *><char *>signature)[0]
        subheader_indices_64bit[i] = idx


_init_subheader_signatures()


def get_subheader_index(bytes signature):
    """Fast version of 'subheader_signature_to_index.get(signature)'."""
    cdef:
        uint32_t sig32
        uint64_t sig64
        Py_ssize_t i
    assert len(signature) in (4, 8)
    if len(signature) == 4:
        sig32 = (<uint32_t *><char *>signature)[0]
        for i in range(len(subheader_signatures_32bit)):
            if subheader_signatures_32bit[i] == sig32:
                return subheader_indices_32bit[i]
    else:
        sig64 = (<uint64_t *><char *>signature)[0]
        for i in range(len(subheader_signatures_64bit)):
            if subheader_signatures_64bit[i] == sig64:
                return subheader_indices_64bit[i]

    return data_subheader_index
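

# Example (illustrative): b"\xF7\xF7\xF7\xF7" is one of the 4-byte signatures
# in const.subheader_signature_to_index, so
#
#     get_subheader_index(b"\xF7\xF7\xF7\xF7")
#
# returns its const.SASIndex value, while any signature not in the table maps
# to data_subheader_index (unknown subheaders are treated as data). The linear
# scan over at most 17 C integers avoids hashing and boxing the bytes key on
# every lookup, which is why it beats the Python dict it mirrors.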

cdef class Parser:
    cdef:
        int column_count
        int64_t[:] lengths
        int64_t[:] offsets
        int64_t[:] column_types
        uint8_t[:, :] byte_chunk
        object[:, :] string_chunk
        uint8_t *cached_page
        int cached_page_len
        int current_row_on_page_index
        int current_page_block_count
        int current_page_data_subheader_pointers_len
        int current_page_subheaders_count
        int current_row_in_chunk_index
        int current_row_in_file_index
        bint blank_missing
        int header_length
        int row_length
        int bit_offset
        int subheader_pointer_length
        int current_page_type
        bint is_little_endian
        int (*decompress)(Buffer, Buffer) except? 0
        object parser

    def __init__(self, object parser):
        cdef:
            int j
            char[:] column_types

        self.parser = parser
        self.blank_missing = parser.blank_missing
        self.header_length = self.parser.header_length
        self.column_count = parser.column_count
        self.lengths = parser.column_data_lengths()
        self.offsets = parser.column_data_offsets()
        self.byte_chunk = parser._byte_chunk
        self.string_chunk = parser._string_chunk
        self.row_length = parser.row_length
        self.bit_offset = self.parser._page_bit_offset
        self.subheader_pointer_length = self.parser._subheader_pointer_length
        self.is_little_endian = parser.byte_order == "<"
        self.column_types = np.empty(self.column_count, dtype="int64")

        # page indicators
        self.update_next_page()

        column_types = parser.column_types()

        # map column types
        for j in range(self.column_count):
            if column_types[j] == b"d":
                self.column_types[j] = column_type_decimal
            elif column_types[j] == b"s":
                self.column_types[j] = column_type_string
            else:
                raise ValueError(
                    f"unknown column type: {self.parser.columns[j].ctype}"
                )

        # compression
        if parser.compression == const.rle_compression:
            self.decompress = rle_decompress
        elif parser.compression == const.rdc_compression:
            self.decompress = rdc_decompress
        else:
            self.decompress = NULL

        # update to current state of the parser
        self.current_row_in_chunk_index = parser._current_row_in_chunk_index
        self.current_row_in_file_index = parser._current_row_in_file_index
        self.current_row_on_page_index = parser._current_row_on_page_index

    def read(self, int nrows):
        cdef:
            bint done
            Py_ssize_t i

        for i in range(nrows):
            done = self.readline()
            if done:
                break

        # update the parser
        self.parser._current_row_on_page_index = self.current_row_on_page_index
        self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index
        self.parser._current_row_in_file_index = self.current_row_in_file_index

    cdef bint read_next_page(self) except? True:
        cdef bint done

        done = self.parser._read_next_page()
        if done:
            self.cached_page = NULL
        else:
            self.update_next_page()
        return done

    cdef update_next_page(self):
        # update data for the current page
        self.cached_page = <uint8_t *>self.parser._cached_page
        self.cached_page_len = len(self.parser._cached_page)
        self.current_row_on_page_index = 0
        self.current_page_type = self.parser._current_page_type
        self.current_page_block_count = self.parser._current_page_block_count
        self.current_page_data_subheader_pointers_len = len(
            self.parser._current_page_data_subheader_pointers
        )
        self.current_page_subheaders_count = self.parser._current_page_subheaders_count
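
    # readline dispatches on the page layout: meta pages locate each row
    # through a subheader pointer, data pages store rows contiguously after
    # the page header, and mix pages store rows after the subheaders, with
    # the first row aligned to an 8-byte boundary. A worked example of the
    # mix-page arithmetic below, assuming plausible 32-bit-format values
    # (bit_offset=16, subheader_pointers_offset=8, 3 subheaders of 12 bytes
    # each): the subheaders end at 16 + 8 + 3 * 12 = 60, align_correction =
    # 60 % 8 = 4, so the first row is read from offset 60 + 4 = 64.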
    cdef bint readline(self) except? True:
        cdef:
            int offset, length, bit_offset, align_correction
            int subheader_pointer_length, mn
            bint done, flag

        bit_offset = self.bit_offset
        subheader_pointer_length = self.subheader_pointer_length

        # If there is no page, go to the end of the header and read a page.
        if self.cached_page == NULL:
            self.parser._path_or_buf.seek(self.header_length)
            done = self.read_next_page()
            if done:
                return True

        # Loop until a data row is read
        while True:
            if self.current_page_type in (page_meta_types_0, page_meta_types_1):
                flag = self.current_row_on_page_index >= \
                    self.current_page_data_subheader_pointers_len
                if flag:
                    done = self.read_next_page()
                    if done:
                        return True
                    continue
                offset, length = self.parser._current_page_data_subheader_pointers[
                    self.current_row_on_page_index
                ]
                self.process_byte_array_with_data(offset, length)
                return False
            elif self.current_page_type == page_mix_type:
                align_correction = (
                    bit_offset
                    + subheader_pointers_offset
                    + self.current_page_subheaders_count * subheader_pointer_length
                )
                align_correction = align_correction % 8
                offset = bit_offset + align_correction
                offset += subheader_pointers_offset
                offset += self.current_page_subheaders_count * subheader_pointer_length
                offset += self.current_row_on_page_index * self.row_length
                self.process_byte_array_with_data(offset, self.row_length)
                mn = min(self.parser.row_count, self.parser._mix_page_row_count)
                if self.current_row_on_page_index == mn:
                    done = self.read_next_page()
                    if done:
                        return True
                return False
            elif self.current_page_type == page_data_type:
                self.process_byte_array_with_data(
                    bit_offset
                    + subheader_pointers_offset
                    + self.current_row_on_page_index * self.row_length,
                    self.row_length,
                )
                flag = self.current_row_on_page_index == self.current_page_block_count
                if flag:
                    done = self.read_next_page()
                    if done:
                        return True
                return False
            else:
                raise ValueError(f"unknown page type: {self.current_page_type}")

    cdef void process_byte_array_with_data(self, int offset, int length) except *:
        cdef:
            Py_ssize_t j
            int s, k, m, jb, js, current_row, rpos
            int64_t lngt, start, ct
            Buffer source, decompressed_source
            int64_t[:] column_types
            int64_t[:] lengths
            int64_t[:] offsets
            uint8_t[:, :] byte_chunk
            object[:, :] string_chunk
            bint compressed

        assert offset + length <= self.cached_page_len, "Out of bounds read"
        source = Buffer(&self.cached_page[offset], length)

        compressed = self.decompress != NULL and length < self.row_length
        if compressed:
            decompressed_source = buf_new(self.row_length)
            rpos = self.decompress(source, decompressed_source)
            if rpos != self.row_length:
                raise ValueError(
                    f"Expected decompressed line of length {self.row_length} bytes "
                    f"but decompressed {rpos} bytes"
                )
            source = decompressed_source

        current_row = self.current_row_in_chunk_index
        column_types = self.column_types
        lengths = self.lengths
        offsets = self.offsets
        byte_chunk = self.byte_chunk
        string_chunk = self.string_chunk
        s = 8 * self.current_row_in_chunk_index
        js = 0
        jb = 0

        for j in range(self.column_count):
            lngt = lengths[j]
            if lngt == 0:
                break
            start = offsets[j]
            ct = column_types[j]
            if ct == column_type_decimal:
                # decimal
                if self.is_little_endian:
                    m = s + 8 - lngt
                else:
                    m = s
                for k in range(lngt):
                    byte_chunk[jb, m + k] = buf_get(source, start + k)
                jb += 1
            elif ct == column_type_string:
                # string
                # Skip trailing whitespace. This is equivalent to calling
                # .rstrip(b"\x00 ") but without Python call overhead.
                while lngt > 0 and buf_get(source, start + lngt - 1) in b"\x00 ":
                    lngt -= 1
                if lngt == 0 and self.blank_missing:
                    string_chunk[js, current_row] = np_nan
                else:
                    string_chunk[js, current_row] = buf_as_bytes(source, start, lngt)
                js += 1

        self.current_row_on_page_index += 1
        self.current_row_in_chunk_index += 1
        self.current_row_in_file_index += 1
        if compressed:
            buf_free(decompressed_source)
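

# Example of how this module is driven (illustrative; the call sites live in
# pandas.io.sas.sas7bdat): SAS7BDATReader allocates the _byte_chunk and
# _string_chunk arrays for a chunk of rows, then hands itself to Parser, which
# fills the chunks row by row:
#
#     p = Parser(sas7bdat_reader)  # sas7bdat_reader: a SAS7BDATReader
#     p.read(nrows)
#
# End users reach this code through the public API, e.g.:
#
#     import pandas as pd
#     df = pd.read_sas("data.sas7bdat")                    # whole file
#     it = pd.read_sas("data.sas7bdat", chunksize=10_000)  # chunked reader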