Inzynierka_Gwiazdy/machine_learning/Lib/site-packages/scipy/io/_idl.py
2023-09-20 19:46:58 +02:00

915 lines
26 KiB
Python

# IDLSave - a python module to read IDL 'save' files
# Copyright (c) 2010 Thomas P. Robitaille
# Many thanks to Craig Markwardt for publishing the Unofficial Format
# Specification for IDL .sav files, without which this Python module would not
# exist (http://cow.physics.wisc.edu/~craigm/idl/savefmt).
# This code was developed by with permission from ITT Visual Information
# Systems. IDL(r) is a registered trademark of ITT Visual Information Systems,
# Inc. for their Interactive Data Language software.
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
__all__ = ['readsav']
import struct
import numpy as np
import tempfile
import zlib
import warnings
# Define the different data types that can be found in an IDL save file
DTYPE_DICT = {1: '>u1',
2: '>i2',
3: '>i4',
4: '>f4',
5: '>f8',
6: '>c8',
7: '|O',
8: '|O',
9: '>c16',
10: '|O',
11: '|O',
12: '>u2',
13: '>u4',
14: '>i8',
15: '>u8'}
# Define the different record types that can be found in an IDL save file
RECTYPE_DICT = {0: "START_MARKER",
1: "COMMON_VARIABLE",
2: "VARIABLE",
3: "SYSTEM_VARIABLE",
6: "END_MARKER",
10: "TIMESTAMP",
12: "COMPILED",
13: "IDENTIFICATION",
14: "VERSION",
15: "HEAP_HEADER",
16: "HEAP_DATA",
17: "PROMOTE64",
19: "NOTICE",
20: "DESCRIPTION"}
# Define a dictionary to contain structure definitions
STRUCT_DICT = {}
def _align_32(f):
'''Align to the next 32-bit position in a file'''
pos = f.tell()
if pos % 4 != 0:
f.seek(pos + 4 - pos % 4)
return
def _skip_bytes(f, n):
'''Skip `n` bytes'''
f.read(n)
return
def _read_bytes(f, n):
'''Read the next `n` bytes'''
return f.read(n)
def _read_byte(f):
'''Read a single byte'''
return np.uint8(struct.unpack('>B', f.read(4)[:1])[0])
def _read_long(f):
'''Read a signed 32-bit integer'''
return np.int32(struct.unpack('>l', f.read(4))[0])
def _read_int16(f):
'''Read a signed 16-bit integer'''
return np.int16(struct.unpack('>h', f.read(4)[2:4])[0])
def _read_int32(f):
'''Read a signed 32-bit integer'''
return np.int32(struct.unpack('>i', f.read(4))[0])
def _read_int64(f):
'''Read a signed 64-bit integer'''
return np.int64(struct.unpack('>q', f.read(8))[0])
def _read_uint16(f):
'''Read an unsigned 16-bit integer'''
return np.uint16(struct.unpack('>H', f.read(4)[2:4])[0])
def _read_uint32(f):
'''Read an unsigned 32-bit integer'''
return np.uint32(struct.unpack('>I', f.read(4))[0])
def _read_uint64(f):
'''Read an unsigned 64-bit integer'''
return np.uint64(struct.unpack('>Q', f.read(8))[0])
def _read_float32(f):
'''Read a 32-bit float'''
return np.float32(struct.unpack('>f', f.read(4))[0])
def _read_float64(f):
'''Read a 64-bit float'''
return np.float64(struct.unpack('>d', f.read(8))[0])
class Pointer:
'''Class used to define pointers'''
def __init__(self, index):
self.index = index
return
class ObjectPointer(Pointer):
'''Class used to define object pointers'''
pass
def _read_string(f):
'''Read a string'''
length = _read_long(f)
if length > 0:
chars = _read_bytes(f, length).decode('latin1')
_align_32(f)
else:
chars = ''
return chars
def _read_string_data(f):
'''Read a data string (length is specified twice)'''
length = _read_long(f)
if length > 0:
length = _read_long(f)
string_data = _read_bytes(f, length)
_align_32(f)
else:
string_data = ''
return string_data
def _read_data(f, dtype):
'''Read a variable with a specified data type'''
if dtype == 1:
if _read_int32(f) != 1:
raise Exception("Error occurred while reading byte variable")
return _read_byte(f)
elif dtype == 2:
return _read_int16(f)
elif dtype == 3:
return _read_int32(f)
elif dtype == 4:
return _read_float32(f)
elif dtype == 5:
return _read_float64(f)
elif dtype == 6:
real = _read_float32(f)
imag = _read_float32(f)
return np.complex64(real + imag * 1j)
elif dtype == 7:
return _read_string_data(f)
elif dtype == 8:
raise Exception("Should not be here - please report this")
elif dtype == 9:
real = _read_float64(f)
imag = _read_float64(f)
return np.complex128(real + imag * 1j)
elif dtype == 10:
return Pointer(_read_int32(f))
elif dtype == 11:
return ObjectPointer(_read_int32(f))
elif dtype == 12:
return _read_uint16(f)
elif dtype == 13:
return _read_uint32(f)
elif dtype == 14:
return _read_int64(f)
elif dtype == 15:
return _read_uint64(f)
else:
raise Exception("Unknown IDL type: %i - please report this" % dtype)
def _read_structure(f, array_desc, struct_desc):
'''
Read a structure, with the array and structure descriptors given as
`array_desc` and `structure_desc` respectively.
'''
nrows = array_desc['nelements']
columns = struct_desc['tagtable']
dtype = []
for col in columns:
if col['structure'] or col['array']:
dtype.append(((col['name'].lower(), col['name']), np.object_))
else:
if col['typecode'] in DTYPE_DICT:
dtype.append(((col['name'].lower(), col['name']),
DTYPE_DICT[col['typecode']]))
else:
raise Exception("Variable type %i not implemented" %
col['typecode'])
structure = np.recarray((nrows, ), dtype=dtype)
for i in range(nrows):
for col in columns:
dtype = col['typecode']
if col['structure']:
structure[col['name']][i] = _read_structure(f,
struct_desc['arrtable'][col['name']],
struct_desc['structtable'][col['name']])
elif col['array']:
structure[col['name']][i] = _read_array(f, dtype,
struct_desc['arrtable'][col['name']])
else:
structure[col['name']][i] = _read_data(f, dtype)
# Reshape structure if needed
if array_desc['ndims'] > 1:
dims = array_desc['dims'][:int(array_desc['ndims'])]
dims.reverse()
structure = structure.reshape(dims)
return structure
def _read_array(f, typecode, array_desc):
'''
Read an array of type `typecode`, with the array descriptor given as
`array_desc`.
'''
if typecode in [1, 3, 4, 5, 6, 9, 13, 14, 15]:
if typecode == 1:
nbytes = _read_int32(f)
if nbytes != array_desc['nbytes']:
warnings.warn("Not able to verify number of bytes from header")
# Read bytes as numpy array
array = np.frombuffer(f.read(array_desc['nbytes']),
dtype=DTYPE_DICT[typecode])
elif typecode in [2, 12]:
# These are 2 byte types, need to skip every two as they are not packed
array = np.frombuffer(f.read(array_desc['nbytes']*2),
dtype=DTYPE_DICT[typecode])[1::2]
else:
# Read bytes into list
array = []
for i in range(array_desc['nelements']):
dtype = typecode
data = _read_data(f, dtype)
array.append(data)
array = np.array(array, dtype=np.object_)
# Reshape array if needed
if array_desc['ndims'] > 1:
dims = array_desc['dims'][:int(array_desc['ndims'])]
dims.reverse()
array = array.reshape(dims)
# Go to next alignment position
_align_32(f)
return array
def _read_record(f):
'''Function to read in a full record'''
record = {'rectype': _read_long(f)}
nextrec = _read_uint32(f)
nextrec += _read_uint32(f) * 2**32
_skip_bytes(f, 4)
if not record['rectype'] in RECTYPE_DICT:
raise Exception("Unknown RECTYPE: %i" % record['rectype'])
record['rectype'] = RECTYPE_DICT[record['rectype']]
if record['rectype'] in ["VARIABLE", "HEAP_DATA"]:
if record['rectype'] == "VARIABLE":
record['varname'] = _read_string(f)
else:
record['heap_index'] = _read_long(f)
_skip_bytes(f, 4)
rectypedesc = _read_typedesc(f)
if rectypedesc['typecode'] == 0:
if nextrec == f.tell():
record['data'] = None # Indicates NULL value
else:
raise ValueError("Unexpected type code: 0")
else:
varstart = _read_long(f)
if varstart != 7:
raise Exception("VARSTART is not 7")
if rectypedesc['structure']:
record['data'] = _read_structure(f, rectypedesc['array_desc'],
rectypedesc['struct_desc'])
elif rectypedesc['array']:
record['data'] = _read_array(f, rectypedesc['typecode'],
rectypedesc['array_desc'])
else:
dtype = rectypedesc['typecode']
record['data'] = _read_data(f, dtype)
elif record['rectype'] == "TIMESTAMP":
_skip_bytes(f, 4*256)
record['date'] = _read_string(f)
record['user'] = _read_string(f)
record['host'] = _read_string(f)
elif record['rectype'] == "VERSION":
record['format'] = _read_long(f)
record['arch'] = _read_string(f)
record['os'] = _read_string(f)
record['release'] = _read_string(f)
elif record['rectype'] == "IDENTIFICATON":
record['author'] = _read_string(f)
record['title'] = _read_string(f)
record['idcode'] = _read_string(f)
elif record['rectype'] == "NOTICE":
record['notice'] = _read_string(f)
elif record['rectype'] == "DESCRIPTION":
record['description'] = _read_string_data(f)
elif record['rectype'] == "HEAP_HEADER":
record['nvalues'] = _read_long(f)
record['indices'] = [_read_long(f) for _ in range(record['nvalues'])]
elif record['rectype'] == "COMMONBLOCK":
record['nvars'] = _read_long(f)
record['name'] = _read_string(f)
record['varnames'] = [_read_string(f) for _ in range(record['nvars'])]
elif record['rectype'] == "END_MARKER":
record['end'] = True
elif record['rectype'] == "UNKNOWN":
warnings.warn("Skipping UNKNOWN record")
elif record['rectype'] == "SYSTEM_VARIABLE":
warnings.warn("Skipping SYSTEM_VARIABLE record")
else:
raise Exception("record['rectype']=%s not implemented" %
record['rectype'])
f.seek(nextrec)
return record
def _read_typedesc(f):
'''Function to read in a type descriptor'''
typedesc = {'typecode': _read_long(f), 'varflags': _read_long(f)}
if typedesc['varflags'] & 2 == 2:
raise Exception("System variables not implemented")
typedesc['array'] = typedesc['varflags'] & 4 == 4
typedesc['structure'] = typedesc['varflags'] & 32 == 32
if typedesc['structure']:
typedesc['array_desc'] = _read_arraydesc(f)
typedesc['struct_desc'] = _read_structdesc(f)
elif typedesc['array']:
typedesc['array_desc'] = _read_arraydesc(f)
return typedesc
def _read_arraydesc(f):
'''Function to read in an array descriptor'''
arraydesc = {'arrstart': _read_long(f)}
if arraydesc['arrstart'] == 8:
_skip_bytes(f, 4)
arraydesc['nbytes'] = _read_long(f)
arraydesc['nelements'] = _read_long(f)
arraydesc['ndims'] = _read_long(f)
_skip_bytes(f, 8)
arraydesc['nmax'] = _read_long(f)
arraydesc['dims'] = [_read_long(f) for _ in range(arraydesc['nmax'])]
elif arraydesc['arrstart'] == 18:
warnings.warn("Using experimental 64-bit array read")
_skip_bytes(f, 8)
arraydesc['nbytes'] = _read_uint64(f)
arraydesc['nelements'] = _read_uint64(f)
arraydesc['ndims'] = _read_long(f)
_skip_bytes(f, 8)
arraydesc['nmax'] = 8
arraydesc['dims'] = []
for d in range(arraydesc['nmax']):
v = _read_long(f)
if v != 0:
raise Exception("Expected a zero in ARRAY_DESC")
arraydesc['dims'].append(_read_long(f))
else:
raise Exception("Unknown ARRSTART: %i" % arraydesc['arrstart'])
return arraydesc
def _read_structdesc(f):
'''Function to read in a structure descriptor'''
structdesc = {}
structstart = _read_long(f)
if structstart != 9:
raise Exception("STRUCTSTART should be 9")
structdesc['name'] = _read_string(f)
predef = _read_long(f)
structdesc['ntags'] = _read_long(f)
structdesc['nbytes'] = _read_long(f)
structdesc['predef'] = predef & 1
structdesc['inherits'] = predef & 2
structdesc['is_super'] = predef & 4
if not structdesc['predef']:
structdesc['tagtable'] = [_read_tagdesc(f)
for _ in range(structdesc['ntags'])]
for tag in structdesc['tagtable']:
tag['name'] = _read_string(f)
structdesc['arrtable'] = {tag['name']: _read_arraydesc(f)
for tag in structdesc['tagtable']
if tag['array']}
structdesc['structtable'] = {tag['name']: _read_structdesc(f)
for tag in structdesc['tagtable']
if tag['structure']}
if structdesc['inherits'] or structdesc['is_super']:
structdesc['classname'] = _read_string(f)
structdesc['nsupclasses'] = _read_long(f)
structdesc['supclassnames'] = [
_read_string(f) for _ in range(structdesc['nsupclasses'])]
structdesc['supclasstable'] = [
_read_structdesc(f) for _ in range(structdesc['nsupclasses'])]
STRUCT_DICT[structdesc['name']] = structdesc
else:
if not structdesc['name'] in STRUCT_DICT:
raise Exception("PREDEF=1 but can't find definition")
structdesc = STRUCT_DICT[structdesc['name']]
return structdesc
def _read_tagdesc(f):
'''Function to read in a tag descriptor'''
tagdesc = {'offset': _read_long(f)}
if tagdesc['offset'] == -1:
tagdesc['offset'] = _read_uint64(f)
tagdesc['typecode'] = _read_long(f)
tagflags = _read_long(f)
tagdesc['array'] = tagflags & 4 == 4
tagdesc['structure'] = tagflags & 32 == 32
tagdesc['scalar'] = tagdesc['typecode'] in DTYPE_DICT
# Assume '10'x is scalar
return tagdesc
def _replace_heap(variable, heap):
if isinstance(variable, Pointer):
while isinstance(variable, Pointer):
if variable.index == 0:
variable = None
else:
if variable.index in heap:
variable = heap[variable.index]
else:
warnings.warn("Variable referenced by pointer not found "
"in heap: variable will be set to None")
variable = None
replace, new = _replace_heap(variable, heap)
if replace:
variable = new
return True, variable
elif isinstance(variable, np.core.records.recarray):
# Loop over records
for ir, record in enumerate(variable):
replace, new = _replace_heap(record, heap)
if replace:
variable[ir] = new
return False, variable
elif isinstance(variable, np.core.records.record):
# Loop over values
for iv, value in enumerate(variable):
replace, new = _replace_heap(value, heap)
if replace:
variable[iv] = new
return False, variable
elif isinstance(variable, np.ndarray):
# Loop over values if type is np.object_
if variable.dtype.type is np.object_:
for iv in range(variable.size):
replace, new = _replace_heap(variable.item(iv), heap)
if replace:
variable.itemset(iv, new)
return False, variable
else:
return False, variable
class AttrDict(dict):
'''
A case-insensitive dictionary with access via item, attribute, and call
notations:
>>> d = AttrDict()
>>> d['Variable'] = 123
>>> d['Variable']
123
>>> d.Variable
123
>>> d.variable
123
>>> d('VARIABLE')
123
>>> d['missing']
Traceback (most recent error last):
...
KeyError: 'missing'
>>> d.missing
Traceback (most recent error last):
...
AttributeError: 'AttrDict' object has no attribute 'missing'
'''
def __init__(self, init={}):
dict.__init__(self, init)
def __getitem__(self, name):
return super().__getitem__(name.lower())
def __setitem__(self, key, value):
return super().__setitem__(key.lower(), value)
def __getattr__(self, name):
try:
return self.__getitem__(name)
except KeyError:
raise AttributeError(
f"'{type(self)}' object has no attribute '{name}'") from None
__setattr__ = __setitem__
__call__ = __getitem__
def readsav(file_name, idict=None, python_dict=False,
uncompressed_file_name=None, verbose=False):
"""
Read an IDL .sav file.
Parameters
----------
file_name : str
Name of the IDL save file.
idict : dict, optional
Dictionary in which to insert .sav file variables.
python_dict : bool, optional
By default, the object return is not a Python dictionary, but a
case-insensitive dictionary with item, attribute, and call access
to variables. To get a standard Python dictionary, set this option
to True.
uncompressed_file_name : str, optional
This option only has an effect for .sav files written with the
/compress option. If a file name is specified, compressed .sav
files are uncompressed to this file. Otherwise, readsav will use
the `tempfile` module to determine a temporary filename
automatically, and will remove the temporary file upon successfully
reading it in.
verbose : bool, optional
Whether to print out information about the save file, including
the records read, and available variables.
Returns
-------
idl_dict : AttrDict or dict
If `python_dict` is set to False (default), this function returns a
case-insensitive dictionary with item, attribute, and call access
to variables. If `python_dict` is set to True, this function
returns a Python dictionary with all variable names in lowercase.
If `idict` was specified, then variables are written to the
dictionary specified, and the updated dictionary is returned.
Examples
--------
>>> from os.path import dirname, join as pjoin
>>> import scipy.io as sio
>>> from scipy.io import readsav
Get the filename for an example .sav file from the tests/data directory.
>>> data_dir = pjoin(dirname(sio.__file__), 'tests', 'data')
>>> sav_fname = pjoin(data_dir, 'array_float32_1d.sav')
Load the .sav file contents.
>>> sav_data = readsav(sav_fname)
Get keys of the .sav file contents.
>>> print(sav_data.keys())
dict_keys(['array1d'])
Access a content with a key.
>>> print(sav_data['array1d'])
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0.]
"""
# Initialize record and variable holders
records = []
if python_dict or idict:
variables = {}
else:
variables = AttrDict()
# Open the IDL file
f = open(file_name, 'rb')
# Read the signature, which should be 'SR'
signature = _read_bytes(f, 2)
if signature != b'SR':
raise Exception("Invalid SIGNATURE: %s" % signature)
# Next, the record format, which is '\x00\x04' for normal .sav
# files, and '\x00\x06' for compressed .sav files.
recfmt = _read_bytes(f, 2)
if recfmt == b'\x00\x04':
pass
elif recfmt == b'\x00\x06':
if verbose:
print("IDL Save file is compressed")
if uncompressed_file_name:
fout = open(uncompressed_file_name, 'w+b')
else:
fout = tempfile.NamedTemporaryFile(suffix='.sav')
if verbose:
print(" -> expanding to %s" % fout.name)
# Write header
fout.write(b'SR\x00\x04')
# Cycle through records
while True:
# Read record type
rectype = _read_long(f)
fout.write(struct.pack('>l', int(rectype)))
# Read position of next record and return as int
nextrec = _read_uint32(f)
nextrec += _read_uint32(f) * 2**32
# Read the unknown 4 bytes
unknown = f.read(4)
# Check if the end of the file has been reached
if RECTYPE_DICT[rectype] == 'END_MARKER':
modval = np.int64(2**32)
fout.write(struct.pack('>I', int(nextrec) % modval))
fout.write(struct.pack('>I', int((nextrec - (nextrec % modval)) / modval)))
fout.write(unknown)
break
# Find current position
pos = f.tell()
# Decompress record
rec_string = zlib.decompress(f.read(nextrec-pos))
# Find new position of next record
nextrec = fout.tell() + len(rec_string) + 12
# Write out record
fout.write(struct.pack('>I', int(nextrec % 2**32)))
fout.write(struct.pack('>I', int((nextrec - (nextrec % 2**32)) / 2**32)))
fout.write(unknown)
fout.write(rec_string)
# Close the original compressed file
f.close()
# Set f to be the decompressed file, and skip the first four bytes
f = fout
f.seek(4)
else:
raise Exception("Invalid RECFMT: %s" % recfmt)
# Loop through records, and add them to the list
while True:
r = _read_record(f)
records.append(r)
if 'end' in r:
if r['end']:
break
# Close the file
f.close()
# Find heap data variables
heap = {}
for r in records:
if r['rectype'] == "HEAP_DATA":
heap[r['heap_index']] = r['data']
# Find all variables
for r in records:
if r['rectype'] == "VARIABLE":
replace, new = _replace_heap(r['data'], heap)
if replace:
r['data'] = new
variables[r['varname'].lower()] = r['data']
if verbose:
# Print out timestamp info about the file
for record in records:
if record['rectype'] == "TIMESTAMP":
print("-"*50)
print("Date: %s" % record['date'])
print("User: %s" % record['user'])
print("Host: %s" % record['host'])
break
# Print out version info about the file
for record in records:
if record['rectype'] == "VERSION":
print("-"*50)
print("Format: %s" % record['format'])
print("Architecture: %s" % record['arch'])
print("Operating System: %s" % record['os'])
print("IDL Version: %s" % record['release'])
break
# Print out identification info about the file
for record in records:
if record['rectype'] == "IDENTIFICATON":
print("-"*50)
print("Author: %s" % record['author'])
print("Title: %s" % record['title'])
print("ID Code: %s" % record['idcode'])
break
# Print out descriptions saved with the file
for record in records:
if record['rectype'] == "DESCRIPTION":
print("-"*50)
print("Description: %s" % record['description'])
break
print("-"*50)
print("Successfully read %i records of which:" %
(len(records)))
# Create convenience list of record types
rectypes = [r['rectype'] for r in records]
for rt in set(rectypes):
if rt != 'END_MARKER':
print(" - %i are of type %s" % (rectypes.count(rt), rt))
print("-"*50)
if 'VARIABLE' in rectypes:
print("Available variables:")
for var in variables:
print(" - %s [%s]" % (var, type(variables[var])))
print("-"*50)
if idict:
for var in variables:
idict[var] = variables[var]
return idict
else:
return variables