2024-11-29 18:15:30 +00:00
parent 40aade2d8e
commit bc9415586e
5298 changed files with 1938676 additions and 80 deletions

View File

@@ -0,0 +1,116 @@
"""
==================================
Input and output (:mod:`scipy.io`)
==================================
.. currentmodule:: scipy.io
SciPy has many modules, classes, and functions available to read data
from and write data to a variety of file formats.
.. seealso:: `NumPy IO routines <https://www.numpy.org/devdocs/reference/routines.io.html>`__
MATLAB® files
=============
.. autosummary::
:toctree: generated/
loadmat - Read a MATLAB style mat file (version 4 through 7.1)
savemat - Write a MATLAB style mat file (version 4 through 7.1)
whosmat - List contents of a MATLAB style mat file (version 4 through 7.1)
For low-level MATLAB reading and writing utilities, see `scipy.io.matlab`.
IDL® files
==========
.. autosummary::
:toctree: generated/
readsav - Read an IDL 'save' file
Matrix Market files
===================
.. autosummary::
:toctree: generated/
mminfo - Query matrix info from Matrix Market formatted file
mmread - Read matrix from Matrix Market formatted file
mmwrite - Write matrix to Matrix Market formatted file
Unformatted Fortran files
===============================
.. autosummary::
:toctree: generated/
FortranFile - A file object for unformatted sequential Fortran files
FortranEOFError - Exception indicating the end of a well-formed file
FortranFormattingError - Exception indicating that the file ended mid-record
Netcdf
======
.. autosummary::
:toctree: generated/
netcdf_file - A file object for NetCDF data
netcdf_variable - A data object for the netcdf module
Harwell-Boeing files
====================
.. autosummary::
:toctree: generated/
hb_read - Read an HB-format file
hb_write - Write an HB-format file
Wav sound files (:mod:`scipy.io.wavfile`)
=========================================
.. module:: scipy.io.wavfile
.. autosummary::
:toctree: generated/
read
write
WavFileWarning
Arff files (:mod:`scipy.io.arff`)
=================================
.. module:: scipy.io.arff
.. autosummary::
:toctree: generated/
loadarff
MetaData
ArffError
ParseArffError
"""
# matfile read and write
from .matlab import loadmat, savemat, whosmat
# netCDF file support
from ._netcdf import netcdf_file, netcdf_variable
# Fortran file support
from ._fortran import FortranFile, FortranEOFError, FortranFormattingError
from ._fast_matrix_market import mminfo, mmread, mmwrite
from ._idl import readsav
from ._harwell_boeing import hb_read, hb_write
# Deprecated namespaces, to be removed in v2.0.0
from . import arff, harwell_boeing, idl, mmio, netcdf, wavfile
__all__ = [s for s in dir() if not s.startswith('_')]
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester
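# Illustrative usage sketch (not part of the committed module): a minimal
# in-memory matfile round-trip through the helpers re-exported above; the
# buffer name is hypothetical.
if __name__ == "__main__":
    from io import BytesIO
    import numpy as np

    buf = BytesIO()
    savemat(buf, {'a': np.arange(4).reshape(2, 2)})  # write one 2x2 array
    buf.seek(0)
    print(loadmat(buf)['a'])  # [[0 1]
                              #  [2 3]]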

View File

@@ -0,0 +1,594 @@
# Copyright (C) 2022-2023 Adam Lugowski. All rights reserved.
# Use of this source code is governed by the BSD 2-clause license found in
# the LICENSE.txt file.
# SPDX-License-Identifier: BSD-2-Clause
"""
Matrix Market I/O with a C++ backend.
See http://math.nist.gov/MatrixMarket/formats.html
for information about the Matrix Market format.
.. versionadded:: 1.12.0
"""
import io
import os
import numpy as np
import scipy.sparse
from scipy.io import _mmio
__all__ = ['mminfo', 'mmread', 'mmwrite']
PARALLELISM = 0
"""
Number of threads that `mmread()` and `mmwrite()` use.
0 means number of CPUs in the system.
Use `threadpoolctl` to set this value.
"""
ALWAYS_FIND_SYMMETRY = False
"""
Whether mmwrite() with symmetry='AUTO' will always search for symmetry
inside the matrix. This is scipy.io._mmio.mmwrite()'s default behavior,
but has a significant performance cost on large matrices.
"""
_field_to_dtype = {
"integer": "int64",
"unsigned-integer": "uint64",
"real": "float64",
"complex": "complex",
"pattern": "float64",
}
def _fmm_version():
from . import _fmm_core
return _fmm_core.__version__
# Register with threadpoolctl, if available
try:
import threadpoolctl
class _FMMThreadPoolCtlController(threadpoolctl.LibController):
user_api = "scipy"
internal_api = "scipy_mmio"
filename_prefixes = ("_fmm_core",)
def get_num_threads(self):
global PARALLELISM
return PARALLELISM
def set_num_threads(self, num_threads):
global PARALLELISM
PARALLELISM = num_threads
def get_version(self):
return _fmm_version()
def set_additional_attributes(self):
pass
threadpoolctl.register(_FMMThreadPoolCtlController)
except (ImportError, AttributeError):
# threadpoolctl not installed or version too old
pass
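# Illustrative sketch (assumes threadpoolctl >= 3.0 is installed): once the
# controller above is registered, the usual threadpoolctl context manager
# also caps the threads used by mmread()/mmwrite():
#
#     with threadpoolctl.threadpool_limits(limits=2, user_api="scipy"):
#         ...  # Matrix Market reads/writes here use at most 2 threads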
class _TextToBytesWrapper(io.BufferedReader):
"""
Convert a TextIOBase string stream to a byte stream.
"""
def __init__(self, text_io_buffer, encoding=None, errors=None, **kwargs):
super().__init__(text_io_buffer, **kwargs)
self.encoding = encoding or text_io_buffer.encoding or 'utf-8'
self.errors = errors or text_io_buffer.errors or 'strict'
def __del__(self):
# do not close the wrapped stream
self.detach()
def _encoding_call(self, method_name, *args, **kwargs):
raw_method = getattr(self.raw, method_name)
val = raw_method(*args, **kwargs)
return val.encode(self.encoding, errors=self.errors)
def read(self, size=-1):
return self._encoding_call('read', size)
def read1(self, size=-1):
return self._encoding_call('read1', size)
def peek(self, size=-1):
return self._encoding_call('peek', size)
def seek(self, offset, whence=0):
# Random seeks are not allowed because of non-trivial conversion
# between byte and character offsets,
# with the possibility of a byte offset landing within a character.
if offset == 0 and whence in (0, 2):
# seek to start or end is ok
super().seek(offset, whence)
else:
# Drop any other seek
# In this application this may happen when pystreambuf seeks during sync(),
# which can happen when closing a partially-read stream.
# Ex. when mminfo() only reads the header then exits.
pass
def _read_body_array(cursor):
"""
Read MatrixMarket array body
"""
from . import _fmm_core
vals = np.zeros(cursor.header.shape, dtype=_field_to_dtype.get(cursor.header.field))
_fmm_core.read_body_array(cursor, vals)
return vals
def _read_body_coo(cursor, generalize_symmetry=True):
"""
Read MatrixMarket coordinate body
"""
from . import _fmm_core
index_dtype = "int32"
if cursor.header.nrows >= 2**31 or cursor.header.ncols >= 2**31:
# Dimensions are too large to fit in int32
index_dtype = "int64"
i = np.zeros(cursor.header.nnz, dtype=index_dtype)
j = np.zeros(cursor.header.nnz, dtype=index_dtype)
data = np.zeros(cursor.header.nnz, dtype=_field_to_dtype.get(cursor.header.field))
_fmm_core.read_body_coo(cursor, i, j, data)
if generalize_symmetry and cursor.header.symmetry != "general":
off_diagonal_mask = (i != j)
off_diagonal_rows = i[off_diagonal_mask]
off_diagonal_cols = j[off_diagonal_mask]
off_diagonal_data = data[off_diagonal_mask]
if cursor.header.symmetry == "skew-symmetric":
off_diagonal_data *= -1
elif cursor.header.symmetry == "hermitian":
off_diagonal_data = off_diagonal_data.conjugate()
i = np.concatenate((i, off_diagonal_cols))
j = np.concatenate((j, off_diagonal_rows))
data = np.concatenate((data, off_diagonal_data))
return (data, (i, j)), cursor.header.shape
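# Example of the generalization above (a sketch): a "symmetric" file that
# stores only (1, 1) = 1.0 and (2, 1) = 2.0 gains a mirrored (1, 2) = 2.0
# entry, so the dense form is [[1., 2.], [2., 0.]]; diagonal entries are
# never duplicated thanks to the off-diagonal mask.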
def _get_read_cursor(source, parallelism=None):
"""
Open file for reading.
"""
from . import _fmm_core
ret_stream_to_close = None
if parallelism is None:
parallelism = PARALLELISM
try:
source = os.fspath(source)
# It's a file path
is_path = True
except TypeError:
is_path = False
if is_path:
path = str(source)
if path.endswith('.gz'):
import gzip
source = gzip.GzipFile(path, 'r')
ret_stream_to_close = source
elif path.endswith('.bz2'):
import bz2
source = bz2.BZ2File(path, 'rb')
ret_stream_to_close = source
else:
return _fmm_core.open_read_file(path, parallelism), ret_stream_to_close
# Stream object.
if hasattr(source, "read"):
if isinstance(source, io.TextIOBase):
source = _TextToBytesWrapper(source)
return _fmm_core.open_read_stream(source, parallelism), ret_stream_to_close
else:
raise TypeError("Unknown source type")
def _get_write_cursor(target, h=None, comment=None, parallelism=None,
symmetry="general", precision=None):
"""
Open file for writing.
"""
from . import _fmm_core
if parallelism is None:
parallelism = PARALLELISM
if comment is None:
comment = ''
if symmetry is None:
symmetry = "general"
if precision is None:
precision = -1
if not h:
h = _fmm_core.header(comment=comment, symmetry=symmetry)
try:
target = os.fspath(target)
# It's a file path
return _fmm_core.open_write_file(str(target), h, parallelism, precision)
except TypeError:
pass
if hasattr(target, "write"):
# Stream object.
if isinstance(target, io.TextIOBase):
raise TypeError("target stream must be open in binary mode.")
return _fmm_core.open_write_stream(target, h, parallelism, precision)
else:
raise TypeError("Unknown source object")
def _apply_field(data, field, no_pattern=False):
"""
Ensure that ``data.dtype`` is compatible with the specified MatrixMarket field type.
Parameters
----------
data : ndarray
Input array.
field : str
Matrix Market field, such as 'real', 'complex', 'integer', 'pattern'.
no_pattern : bool, optional
Whether an empty array may be returned for a 'pattern' field.
Returns
-------
data : ndarray
Input data if no conversion necessary, or a converted version
"""
if field is None:
return data
if field == "pattern":
if no_pattern:
return data
else:
return np.zeros(0)
dtype = _field_to_dtype.get(field, None)
if dtype is None:
raise ValueError("Invalid field.")
return np.asarray(data, dtype=dtype)
def _validate_symmetry(symmetry):
"""
Check that the symmetry parameter is one that MatrixMarket allows.
"""
if symmetry is None:
return "general"
symmetry = str(symmetry).lower()
symmetries = ["general", "symmetric", "skew-symmetric", "hermitian"]
if symmetry not in symmetries:
raise ValueError("Invalid symmetry. Must be one of: " + ", ".join(symmetries))
return symmetry
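# Sketch of the normalization above: _validate_symmetry(None) -> 'general',
# _validate_symmetry('Hermitian') -> 'hermitian', and any string outside the
# four Matrix Market symmetries raises ValueError.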
def mmread(source):
"""
Reads the contents of a Matrix Market file-like 'source' into a matrix.
Parameters
----------
source : str or file-like
Matrix Market filename (extensions .mtx, .mtx.gz)
or open file-like object.
Returns
-------
a : ndarray or coo_matrix
Dense or sparse matrix depending on the matrix format in the
Matrix Market file.
Notes
-----
.. versionchanged:: 1.12.0
C++ implementation.
Examples
--------
>>> from io import StringIO
>>> from scipy.io import mmread
>>> text = '''%%MatrixMarket matrix coordinate real general
... 5 5 7
... 2 3 1.0
... 3 4 2.0
... 3 5 3.0
... 4 1 4.0
... 4 2 5.0
... 4 3 6.0
... 4 4 7.0
... '''
``mmread(source)`` returns the data as a sparse matrix in COO format.
>>> m = mmread(StringIO(text))
>>> m
<COOrdinate sparse matrix of dtype 'float64'
with 7 stored elements and shape (5, 5)>
>>> m.toarray()
array([[0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 2., 3.],
[4., 5., 6., 7., 0.],
[0., 0., 0., 0., 0.]])
This function is threaded.
The default number of threads is equal to the number of CPUs in the system.
Use `threadpoolctl <https://github.com/joblib/threadpoolctl>`_ to override:
>>> import threadpoolctl
>>>
>>> with threadpoolctl.threadpool_limits(limits=2):
... m = mmread(StringIO(text))
"""
cursor, stream_to_close = _get_read_cursor(source)
if cursor.header.format == "array":
mat = _read_body_array(cursor)
if stream_to_close:
stream_to_close.close()
return mat
else:
from scipy.sparse import coo_matrix
triplet, shape = _read_body_coo(cursor, generalize_symmetry=True)
if stream_to_close:
stream_to_close.close()
return coo_matrix(triplet, shape=shape)
def mmwrite(target, a, comment=None, field=None, precision=None, symmetry="AUTO"):
r"""
Writes the sparse or dense array `a` to Matrix Market file-like `target`.
Parameters
----------
target : str or file-like
Matrix Market filename (extension .mtx) or open file-like object.
a : array_like
Sparse or dense 2-D array.
comment : str, optional
Comments to be prepended to the Matrix Market file.
field : None or str, optional
Either 'real', 'complex', 'pattern', or 'integer'.
precision : None or int, optional
Number of digits to display for real or complex values.
symmetry : None or str, optional
Either 'AUTO', 'general', 'symmetric', 'skew-symmetric', or 'hermitian'.
If symmetry is None the symmetry type of 'a' is determined by its
values. If symmetry is 'AUTO' the symmetry type of 'a' is either
determined or set to 'general', at mmwrite's discretion.
Returns
-------
None
Notes
-----
.. versionchanged:: 1.12.0
C++ implementation.
Examples
--------
>>> from io import BytesIO
>>> import numpy as np
>>> from scipy.sparse import coo_matrix
>>> from scipy.io import mmwrite
Write a small NumPy array to a matrix market file. The file will be
written in the ``'array'`` format.
>>> a = np.array([[1.0, 0, 0, 0], [0, 2.5, 0, 6.25]])
>>> target = BytesIO()
>>> mmwrite(target, a)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix array real general
%
2 4
1
0
0
2.5
0
0
0
6.25
Add a comment to the output file, and set the precision to 3.
>>> target = BytesIO()
>>> mmwrite(target, a, comment='\n Some test data.\n', precision=3)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix array real general
%
% Some test data.
%
2 4
1.00e+00
0.00e+00
0.00e+00
2.50e+00
0.00e+00
0.00e+00
0.00e+00
6.25e+00
Convert to a sparse matrix before calling ``mmwrite``. This will
result in the output format being ``'coordinate'`` rather than
``'array'``.
>>> target = BytesIO()
>>> mmwrite(target, coo_matrix(a), precision=3)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix coordinate real general
%
2 4 3
1 1 1.00e+00
2 2 2.50e+00
2 4 6.25e+00
Write a complex Hermitian array to a matrix market file. Note that
only six values are actually written to the file; the other values
are implied by the symmetry.
>>> z = np.array([[3, 1+2j, 4-3j], [1-2j, 1, -5j], [4+3j, 5j, 2.5]])
>>> z
array([[ 3. +0.j, 1. +2.j, 4. -3.j],
[ 1. -2.j, 1. +0.j, -0. -5.j],
[ 4. +3.j, 0. +5.j, 2.5+0.j]])
>>> target = BytesIO()
>>> mmwrite(target, z, precision=2)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix array complex hermitian
%
3 3
3.0e+00 0.0e+00
1.0e+00 -2.0e+00
4.0e+00 3.0e+00
1.0e+00 0.0e+00
0.0e+00 5.0e+00
2.5e+00 0.0e+00
This function is threaded.
The default number of threads is equal to the number of CPUs in the system.
Use `threadpoolctl <https://github.com/joblib/threadpoolctl>`_ to override:
>>> import threadpoolctl
>>>
>>> target = BytesIO()
>>> with threadpoolctl.threadpool_limits(limits=2):
... mmwrite(target, a)
"""
from . import _fmm_core
if isinstance(a, (list, tuple)) or hasattr(a, "__array__"):
a = np.asarray(a)
if symmetry == "AUTO":
if ALWAYS_FIND_SYMMETRY or (hasattr(a, "shape") and max(a.shape) < 100):
symmetry = None
else:
symmetry = "general"
if symmetry is None:
symmetry = _mmio.MMFile()._get_symmetry(a)
symmetry = _validate_symmetry(symmetry)
cursor = _get_write_cursor(target, comment=comment,
precision=precision, symmetry=symmetry)
if isinstance(a, np.ndarray):
# Write dense numpy arrays
a = _apply_field(a, field, no_pattern=True)
_fmm_core.write_body_array(cursor, a)
elif scipy.sparse.issparse(a):
# Write sparse scipy matrices
a = a.tocoo()
if symmetry is not None and symmetry != "general":
# A symmetric matrix stores only the entries on or below the diagonal.
# Ensure that the matrix satisfies this requirement.
from scipy.sparse import coo_array
lower_triangle_mask = a.row >= a.col
a = coo_array((a.data[lower_triangle_mask],
(a.row[lower_triangle_mask],
a.col[lower_triangle_mask])), shape=a.shape)
data = _apply_field(a.data, field)
_fmm_core.write_body_coo(cursor, a.shape, a.row, a.col, data)
else:
raise ValueError("unknown matrix type: %s" % type(a))
def mminfo(source):
"""
Return size and storage parameters from Matrix Market file-like 'source'.
Parameters
----------
source : str or file-like
Matrix Market filename (extension .mtx) or open file-like object
Returns
-------
rows : int
Number of matrix rows.
cols : int
Number of matrix columns.
entries : int
Number of non-zero entries of a sparse matrix
or rows*cols for a dense matrix.
format : str
Either 'coordinate' or 'array'.
field : str
Either 'real', 'complex', 'pattern', or 'integer'.
symmetry : str
Either 'general', 'symmetric', 'skew-symmetric', or 'hermitian'.
Notes
-----
.. versionchanged:: 1.12.0
C++ implementation.
Examples
--------
>>> from io import StringIO
>>> from scipy.io import mminfo
>>> text = '''%%MatrixMarket matrix coordinate real general
... 5 5 7
... 2 3 1.0
... 3 4 2.0
... 3 5 3.0
... 4 1 4.0
... 4 2 5.0
... 4 3 6.0
... 4 4 7.0
... '''
``mminfo(source)`` returns the number of rows, number of columns,
format, field type and symmetry attribute of the source file.
>>> mminfo(StringIO(text))
(5, 5, 7, 'coordinate', 'real', 'general')
"""
cursor, stream_to_close = _get_read_cursor(source, 1)
h = cursor.header
cursor.close()
if stream_to_close:
stream_to_close.close()
return h.nrows, h.ncols, h.nnz, h.format, h.field, h.symmetry

View File

@@ -0,0 +1,354 @@
"""
Module to read / write Fortran unformatted sequential files.
This is in the spirit of code written by Neil Martinsen-Burrell and Joe Zuntz.
"""
import warnings
import numpy as np
__all__ = ['FortranFile', 'FortranEOFError', 'FortranFormattingError']
class FortranEOFError(TypeError, OSError):
"""Indicates that the file ended properly.
This error descends from TypeError because the code used to raise
TypeError (and this was the only way to know that the file had
ended), so users might have ``except TypeError:``.
"""
pass
class FortranFormattingError(TypeError, OSError):
"""Indicates that the file ended mid-record.
Descends from TypeError for backward compatibility.
"""
pass
class FortranFile:
"""
A file object for unformatted sequential files from Fortran code.
Parameters
----------
filename : file or str
Open file object or filename.
mode : {'r', 'w'}, optional
Read-write mode, default is 'r'.
header_dtype : dtype, optional
Data type of the header. Size and endianness must match the input/output file.
Notes
-----
These files are broken up into records of unspecified types. The size of
each record is given at the start (although the size of this header is not
standard) and the data is written onto disk without any formatting. Fortran
compilers supporting the BACKSPACE statement will write a second copy of
the size to facilitate backwards seeking.
This class only supports files written with both sizes for the record.
It also does not support the subrecords used by the Intel and gfortran
compilers for records larger than 2 GB with a 4-byte header.
An example of an unformatted sequential file in Fortran would be written as::
OPEN(1, FILE=myfilename, FORM='unformatted')
WRITE(1) myvariable
Since this is a non-standard file format, whose contents depend on the
compiler and the endianness of the machine, caution is advised. Files from
gfortran 4.8.0 and gfortran 4.1.2 on x86_64 are known to work.
Consider using Fortran direct-access files or files from the newer Stream
I/O, which can be easily read by `numpy.fromfile`.
Examples
--------
To create an unformatted sequential Fortran file:
>>> from scipy.io import FortranFile
>>> import numpy as np
>>> f = FortranFile('test.unf', 'w')
>>> f.write_record(np.array([1,2,3,4,5], dtype=np.int32))
>>> f.write_record(np.linspace(0,1,20).reshape((5,4)).T)
>>> f.close()
To read this file:
>>> f = FortranFile('test.unf', 'r')
>>> print(f.read_ints(np.int32))
[1 2 3 4 5]
>>> print(f.read_reals(float).reshape((5,4), order="F"))
[[0. 0.05263158 0.10526316 0.15789474]
[0.21052632 0.26315789 0.31578947 0.36842105]
[0.42105263 0.47368421 0.52631579 0.57894737]
[0.63157895 0.68421053 0.73684211 0.78947368]
[0.84210526 0.89473684 0.94736842 1. ]]
>>> f.close()
Or, in Fortran::
integer :: a(5), i
double precision :: b(5,4)
open(1, file='test.unf', form='unformatted')
read(1) a
read(1) b
close(1)
write(*,*) a
do i = 1, 5
write(*,*) b(i,:)
end do
"""
def __init__(self, filename, mode='r', header_dtype=np.uint32):
if header_dtype is None:
raise ValueError('Must specify dtype')
header_dtype = np.dtype(header_dtype)
if header_dtype.kind != 'u':
warnings.warn("Given a dtype which is not unsigned.", stacklevel=2)
if mode not in 'rw' or len(mode) != 1:
raise ValueError('mode must be either r or w')
if hasattr(filename, 'seek'):
self._fp = filename
else:
self._fp = open(filename, '%sb' % mode)
self._header_dtype = header_dtype
def _read_size(self, eof_ok=False):
n = self._header_dtype.itemsize
b = self._fp.read(n)
if (not b) and eof_ok:
raise FortranEOFError("End of file occurred at end of record")
elif len(b) < n:
raise FortranFormattingError(
"End of file in the middle of the record size")
return int(np.frombuffer(b, dtype=self._header_dtype, count=1)[0])
def write_record(self, *items):
"""
Write a record (including sizes) to the file.
Parameters
----------
*items : array_like
The data arrays to write.
Notes
-----
Writes data items to a file::
write_record(a.T, b.T, c.T, ...)
write(1) a, b, c, ...
Note that data in multidimensional arrays is written in
row-major order. To have Fortran programs read it back
correctly, you need to transpose the arrays yourself when
writing them.
"""
items = tuple(np.asarray(item) for item in items)
total_size = sum(item.nbytes for item in items)
nb = np.array([total_size], dtype=self._header_dtype)
nb.tofile(self._fp)
for item in items:
item.tofile(self._fp)
nb.tofile(self._fp)
def read_record(self, *dtypes, **kwargs):
"""
Reads a record of a given type from the file.
Parameters
----------
*dtypes : dtypes, optional
Data type(s) specifying the size and endianness of the data.
Returns
-------
data : ndarray
A 1-D array object.
Raises
------
FortranEOFError
To signal that no further records are available
FortranFormattingError
To signal that the end of the file was encountered
part-way through a record
Notes
-----
If the record contains a multidimensional array, you can specify
the size in the dtype. For example::
INTEGER var(5,4)
can be read with::
read_record('(4,5)i4').T
Note that this function does **not** assume the file data is in Fortran
column major order, so you need to (i) swap the order of dimensions
when reading and (ii) transpose the resulting array.
Alternatively, you can read the data as a 1-D array and handle the
ordering yourself. For example::
read_record('i4').reshape(5, 4, order='F')
For records that contain several variables or mixed types (as opposed
to single scalar or array types), give them as separate arguments::
double precision :: a
integer :: b
write(1) a, b
record = f.read_record('<f8', '<i4')
a = record[0] # first number
b = record[1] # second number
and if any of the variables are arrays, the shape can be specified as
the third item in the relevant dtype::
double precision :: a
integer :: b(3,4)
write(1) a, b
record = f.read_record('<f8', np.dtype(('<i4', (4, 3))))
a = record[0]
b = record[1].T
NumPy also supports a short syntax for this kind of type::
record = f.read_record('<f8', '(4,3)<i4')
See Also
--------
read_reals
read_ints
"""
dtype = kwargs.pop('dtype', None)
if kwargs:
raise ValueError(f"Unknown keyword arguments {tuple(kwargs.keys())}")
if dtype is not None:
dtypes = dtypes + (dtype,)
elif not dtypes:
raise ValueError('Must specify at least one dtype')
first_size = self._read_size(eof_ok=True)
dtypes = tuple(np.dtype(dtype) for dtype in dtypes)
block_size = sum(dtype.itemsize for dtype in dtypes)
num_blocks, remainder = divmod(first_size, block_size)
if remainder != 0:
raise ValueError(f'Size obtained ({first_size}) is not a multiple of the '
f'dtypes given ({block_size}).')
if len(dtypes) != 1 and first_size != block_size:
# Fortran does not write mixed type array items in interleaved order,
# and it's not possible to guess the sizes of the arrays that were written.
# The user must specify the exact sizes of each of the arrays.
raise ValueError(f'Size obtained ({first_size}) does not match with the '
f'expected size ({block_size}) of multi-item record')
data = []
for dtype in dtypes:
r = np.fromfile(self._fp, dtype=dtype, count=num_blocks)
if len(r) != num_blocks:
raise FortranFormattingError(
"End of file in the middle of a record")
if dtype.shape != ():
# Squeeze outermost block dimension for array items
if num_blocks == 1:
assert r.shape == (1,) + dtype.shape
r = r[0]
data.append(r)
second_size = self._read_size()
if first_size != second_size:
raise ValueError('Sizes do not agree in the header and footer for '
'this record - check header dtype')
# Unpack result
if len(dtypes) == 1:
return data[0]
else:
return tuple(data)
def read_ints(self, dtype='i4'):
"""
Reads a record of a given type from the file, defaulting to an integer
type (``INTEGER*4`` in Fortran).
Parameters
----------
dtype : dtype, optional
Data type specifying the size and endianness of the data.
Returns
-------
data : ndarray
A 1-D array object.
See Also
--------
read_reals
read_record
"""
return self.read_record(dtype)
def read_reals(self, dtype='f8'):
"""
Reads a record of a given type from the file, defaulting to a floating
point number (``real*8`` in Fortran).
Parameters
----------
dtype : dtype, optional
Data type specifying the size and endianness of the data.
Returns
-------
data : ndarray
A 1-D array object.
See Also
--------
read_ints
read_record
"""
return self.read_record(dtype)
def close(self):
"""
Closes the file. Calling any other method on this object after
closing it is unsupported. Note that this class supports the 'with'
statement, which calls this method automatically.
"""
self._fp.close()
def __enter__(self):
return self
def __exit__(self, type, value, tb):
self.close()
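# Illustrative sketch of the on-disk layout described in the class notes:
# the record payload is framed by its byte count on both ends. Assumes a
# little-endian machine and a writable working directory; 'demo.unf' is a
# hypothetical file name.
if __name__ == "__main__":
    import os

    with FortranFile('demo.unf', 'w') as f:
        f.write_record(np.array([1, 2], dtype=np.int32))
    raw = open('demo.unf', 'rb').read()
    print(np.frombuffer(raw, dtype=np.uint32))  # [8 1 2 8]
    os.remove('demo.unf')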

View File

@@ -0,0 +1,7 @@
from .hb import hb_read, hb_write
__all__ = ["hb_read", "hb_write"]
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester

View File

@ -0,0 +1,309 @@
"""
Preliminary module to handle Fortran formats for IO. Do not use this outside
scipy.io for now, until the API is deemed reasonable.
The *Format classes handle conversion between Fortran and Python format, and
FortranFormatParser can create *Format instances from raw Fortran format
strings (e.g. '(3I4)', '(10I3)', etc.)
"""
import re
import numpy as np
__all__ = ["BadFortranFormat", "FortranFormatParser", "IntFormat", "ExpFormat"]
TOKENS = {
"LPAR": r"\(",
"RPAR": r"\)",
"INT_ID": r"I",
"EXP_ID": r"E",
"INT": r"\d+",
"DOT": r"\.",
}
class BadFortranFormat(SyntaxError):
pass
def number_digits(n):
return int(np.floor(np.log10(np.abs(n))) + 1)
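# For example: number_digits(123) == 3 and number_digits(-4) == 1; the sign
# is accounted for separately by the callers below. Note that n must be
# nonzero.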
class IntFormat:
@classmethod
def from_number(cls, n, min=None):
"""Given an integer, returns a "reasonable" IntFormat instance to represent
any number between 0 and n if n > 0, or between n and -n if n < 0
Parameters
----------
n : int
max number one wants to be able to represent
min : int
minimum number of characters to use for the format
Returns
-------
res : IntFormat
IntFormat instance with reasonable (see Notes) computed width
Notes
-----
Reasonable should be understood as the minimal string length necessary
without losing precision. For example, IntFormat.from_number(1) will
return an IntFormat instance of width 2: one character for the digit
itself plus one leading character for a space or sign.
"""
width = number_digits(n) + 1
if n < 0:
width += 1
repeat = 80 // width
return cls(width, min, repeat=repeat)
def __init__(self, width, min=None, repeat=None):
self.width = width
self.repeat = repeat
self.min = min
def __repr__(self):
r = "IntFormat("
if self.repeat:
r += "%d" % self.repeat
r += "I%d" % self.width
if self.min:
r += ".%d" % self.min
return r + ")"
@property
def fortran_format(self):
r = "("
if self.repeat:
r += "%d" % self.repeat
r += "I%d" % self.width
if self.min:
r += ".%d" % self.min
return r + ")"
@property
def python_format(self):
return "%" + str(self.width) + "d"
class ExpFormat:
@classmethod
def from_number(cls, n, min=None):
"""Given a float number, returns a "reasonable" ExpFormat instance to
represent any number between -n and n.
Parameters
----------
n : float
max number one wants to be able to represent
min : int
minimum number of characters to use for the format
Returns
-------
res : ExpFormat
ExpFormat instance with reasonable (see Notes) computed width
Notes
-----
Reasonable should be understood as the minimal string length necessary
to avoid losing precision.
"""
# Length of one number in exp format: sign + one leading digit + '.' +
# number of digits for the fractional part + 'E' + sign of exponent +
# number of digits of the exponent
finfo = np.finfo(n.dtype)
# Number of digits for fractional part
n_prec = finfo.precision + 1
# Number of digits for exponential part
n_exp = number_digits(np.max(np.abs([finfo.maxexp, finfo.minexp])))
width = 1 + 1 + n_prec + 1 + n_exp + 1
if n < 0:
width += 1
repeat = int(np.floor(80 / width))
return cls(width, n_prec, min, repeat=repeat)
def __init__(self, width, significand, min=None, repeat=None):
"""\
Parameters
----------
width : int
number of characters taken by the string (includes space).
"""
self.width = width
self.significand = significand
self.repeat = repeat
self.min = min
def __repr__(self):
r = "ExpFormat("
if self.repeat:
r += "%d" % self.repeat
r += "E%d.%d" % (self.width, self.significand)
if self.min:
r += "E%d" % self.min
return r + ")"
@property
def fortran_format(self):
r = "("
if self.repeat:
r += "%d" % self.repeat
r += "E%d.%d" % (self.width, self.significand)
if self.min:
r += "E%d" % self.min
return r + ")"
@property
def python_format(self):
return "%" + str(self.width-1) + "." + str(self.significand) + "E"
class Token:
def __init__(self, type, value, pos):
self.type = type
self.value = value
self.pos = pos
def __str__(self):
return f"""Token('{self.type}', "{self.value}")"""
def __repr__(self):
return self.__str__()
class Tokenizer:
def __init__(self):
self.tokens = list(TOKENS.keys())
self.res = [re.compile(TOKENS[i]) for i in self.tokens]
def input(self, s):
self.data = s
self.curpos = 0
self.len = len(s)
def next_token(self):
curpos = self.curpos
while curpos < self.len:
for i, r in enumerate(self.res):
m = r.match(self.data, curpos)
if m is None:
continue
else:
self.curpos = m.end()
return Token(self.tokens[i], m.group(), self.curpos)
raise SyntaxError("Unknown character at position %d (%s)"
% (self.curpos, self.data[curpos]))
# Grammar for fortran format:
# format : LPAR format_string RPAR
# format_string : repeated | simple
# repeated : repeat simple
# simple : int_fmt | exp_fmt
# int_fmt : INT_ID width
# exp_fmt : simple_exp_fmt | extended_exp_fmt
# simple_exp_fmt : EXP_ID width DOT significand
# extended_exp_fmt : EXP_ID width DOT significand EXP_ID ndigits
# repeat : INT
# width : INT
# significand : INT
# ndigits : INT
# Naive fortran formatter - parser is hand-made
class FortranFormatParser:
"""Parser for Fortran format strings. The parse method returns a *Format
instance.
Notes
-----
Only ExpFormat (exponential format for floating values) and IntFormat
(integer format) are supported for now.
"""
def __init__(self):
self.tokenizer = Tokenizer()
def parse(self, s):
self.tokenizer.input(s)
tokens = []
try:
while True:
t = self.tokenizer.next_token()
if t is None:
break
else:
tokens.append(t)
return self._parse_format(tokens)
except SyntaxError as e:
raise BadFortranFormat(str(e)) from e
def _get_min(self, tokens):
next = tokens.pop(0)
if not next.type == "DOT":
raise SyntaxError()
next = tokens.pop(0)
return next.value
def _expect(self, token, tp):
if not token.type == tp:
raise SyntaxError()
def _parse_format(self, tokens):
if not tokens[0].type == "LPAR":
raise SyntaxError("Expected left parenthesis at position "
"%d (got '%s')" % (0, tokens[0].value))
elif not tokens[-1].type == "RPAR":
raise SyntaxError("Expected right parenthesis at position "
"%d (got '%s')" % (len(tokens), tokens[-1].value))
tokens = tokens[1:-1]
types = [t.type for t in tokens]
if types[0] == "INT":
repeat = int(tokens.pop(0).value)
else:
repeat = None
next = tokens.pop(0)
if next.type == "INT_ID":
next = self._next(tokens, "INT")
width = int(next.value)
if tokens:
min = int(self._get_min(tokens))
else:
min = None
return IntFormat(width, min, repeat)
elif next.type == "EXP_ID":
next = self._next(tokens, "INT")
width = int(next.value)
next = self._next(tokens, "DOT")
next = self._next(tokens, "INT")
significand = int(next.value)
if tokens:
next = self._next(tokens, "EXP_ID")
next = self._next(tokens, "INT")
min = int(next.value)
else:
min = None
return ExpFormat(width, significand, min, repeat)
else:
raise SyntaxError("Invalid formatter type %s" % next.value)
def _next(self, tokens, tp):
if not len(tokens) > 0:
raise SyntaxError()
next = tokens.pop(0)
self._expect(next, tp)
return next
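# Illustrative sketch: round-tripping a raw format string through the parser.
if __name__ == "__main__":
    parser = FortranFormatParser()
    fmt = parser.parse("(3E24.16)")
    print(fmt.fortran_format)  # (3E24.16)
    print(fmt.python_format)   # %23.16E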

View File

@@ -0,0 +1,574 @@
"""
Implementation of Harwell-Boeing read/write.
At the moment, the full Harwell-Boeing format is not supported. Supported
features are:
- assembled, non-symmetric, real matrices
- integer for pointer/indices
- exponential format for float values, and int format
"""
# TODO:
# - Add more support (symmetric/complex matrices, non-assembled matrices ?)
# XXX: reading is reasonably efficient (>= 85 % is in numpy.fromstring), but
# takes a lot of memory. Being faster would require compiled code.
# Writing is not efficient. Although not a terribly exciting task,
# having reusable facilities to efficiently read/write fortran-formatted files
# would be useful outside this module.
import warnings
import numpy as np
from scipy.sparse import csc_matrix
from ._fortran_format_parser import FortranFormatParser, IntFormat, ExpFormat
__all__ = ["hb_read", "hb_write"]
class MalformedHeader(Exception):
pass
class LineOverflow(Warning):
pass
def _nbytes_full(fmt, nlines):
"""Return the number of bytes to read to get every full lines for the
given parsed fortran format."""
return (fmt.repeat * fmt.width + 1) * (nlines - 1)
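# For example (sketch): with the format (26I3), i.e. repeat 26 and width 3,
# each full line holds 26 * 3 characters plus a newline, so the nlines - 1
# guaranteed-full lines span (26 * 3 + 1) * (nlines - 1) bytes; the last,
# possibly partial line is read separately with readline().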
class HBInfo:
@classmethod
def from_data(cls, m, title="Default title", key="0", mxtype=None, fmt=None):
"""Create a HBInfo instance from an existing sparse matrix.
Parameters
----------
m : sparse matrix
the HBInfo instance will derive its parameters from m
title : str
Title to put in the HB header
key : str
Key
mxtype : HBMatrixType
type of the input matrix
fmt : dict
not implemented
Returns
-------
hb_info : HBInfo instance
"""
m = m.tocsc(copy=False)
pointer = m.indptr
indices = m.indices
values = m.data
nrows, ncols = m.shape
nnon_zeros = m.nnz
if fmt is None:
# +1 because HB uses one-based (Fortran) indexing, and we will write
# the indices/pointer as such
pointer_fmt = IntFormat.from_number(np.max(pointer+1))
indices_fmt = IntFormat.from_number(np.max(indices+1))
if values.dtype.kind in np.typecodes["AllFloat"]:
values_fmt = ExpFormat.from_number(-np.max(np.abs(values)))
elif values.dtype.kind in np.typecodes["AllInteger"]:
values_fmt = IntFormat.from_number(-np.max(np.abs(values)))
else:
message = f"type {values.dtype.kind} not implemented yet"
raise NotImplementedError(message)
else:
raise NotImplementedError("fmt argument not supported yet.")
if mxtype is None:
if not np.isrealobj(values):
raise ValueError("Complex values not supported yet")
if values.dtype.kind in np.typecodes["AllInteger"]:
tp = "integer"
elif values.dtype.kind in np.typecodes["AllFloat"]:
tp = "real"
else:
raise NotImplementedError("type %s for values not implemented"
% values.dtype)
mxtype = HBMatrixType(tp, "unsymmetric", "assembled")
else:
raise ValueError("mxtype argument not handled yet.")
def _nlines(fmt, size):
nlines = size // fmt.repeat
if nlines * fmt.repeat != size:
nlines += 1
return nlines
pointer_nlines = _nlines(pointer_fmt, pointer.size)
indices_nlines = _nlines(indices_fmt, indices.size)
values_nlines = _nlines(values_fmt, values.size)
total_nlines = pointer_nlines + indices_nlines + values_nlines
return cls(title, key,
total_nlines, pointer_nlines, indices_nlines, values_nlines,
mxtype, nrows, ncols, nnon_zeros,
pointer_fmt.fortran_format, indices_fmt.fortran_format,
values_fmt.fortran_format)
@classmethod
def from_file(cls, fid):
"""Create a HBInfo instance from a file object containing a matrix in the
HB format.
Parameters
----------
fid : file-like
File or file-like object containing a matrix in the HB format.
Returns
-------
hb_info : HBInfo instance
"""
# First line
line = fid.readline().strip("\n")
if not len(line) > 72:
raise ValueError("Expected at least 72 characters for first line, "
"got: \n%s" % line)
title = line[:72]
key = line[72:]
# Second line
line = fid.readline().strip("\n")
if not len(line.rstrip()) >= 56:
raise ValueError("Expected at least 56 characters for second line, "
"got: \n%s" % line)
total_nlines = _expect_int(line[:14])
pointer_nlines = _expect_int(line[14:28])
indices_nlines = _expect_int(line[28:42])
values_nlines = _expect_int(line[42:56])
rhs_nlines = line[56:72].strip()
if rhs_nlines == '':
rhs_nlines = 0
else:
rhs_nlines = _expect_int(rhs_nlines)
if not rhs_nlines == 0:
raise ValueError("Only files without right hand side supported for "
"now.")
# Third line
line = fid.readline().strip("\n")
if not len(line) >= 70:
raise ValueError("Expected at least 72 character for third line, got:\n"
"%s" % line)
mxtype_s = line[:3].upper()
if not len(mxtype_s) == 3:
raise ValueError("mxtype expected to be 3 characters long")
mxtype = HBMatrixType.from_fortran(mxtype_s)
if mxtype.value_type not in ["real", "integer"]:
raise ValueError("Only real or integer matrices supported for "
"now (detected %s)" % mxtype)
if not mxtype.structure == "unsymmetric":
raise ValueError("Only unsymmetric matrices supported for "
"now (detected %s)" % mxtype)
if not mxtype.storage == "assembled":
raise ValueError("Only assembled matrices supported for now")
if not line[3:14] == " " * 11:
raise ValueError("Malformed data for third line: %s" % line)
nrows = _expect_int(line[14:28])
ncols = _expect_int(line[28:42])
nnon_zeros = _expect_int(line[42:56])
nelementals = _expect_int(line[56:70])
if not nelementals == 0:
raise ValueError("Unexpected value %d for nltvl (last entry of line 3)"
% nelementals)
# Fourth line
line = fid.readline().strip("\n")
ct = line.split()
if not len(ct) == 3:
raise ValueError("Expected 3 formats, got %s" % ct)
return cls(title, key,
total_nlines, pointer_nlines, indices_nlines, values_nlines,
mxtype, nrows, ncols, nnon_zeros,
ct[0], ct[1], ct[2],
rhs_nlines, nelementals)
def __init__(self, title, key,
total_nlines, pointer_nlines, indices_nlines, values_nlines,
mxtype, nrows, ncols, nnon_zeros,
pointer_format_str, indices_format_str, values_format_str,
right_hand_sides_nlines=0, nelementals=0):
"""Do not use this directly, but the class ctrs (from_* functions)."""
self.title = title
self.key = key
if title is None:
title = "No Title"
if len(title) > 72:
raise ValueError("title cannot be > 72 characters")
if key is None:
key = "|No Key"
if len(key) > 8:
warnings.warn("key is > 8 characters (key is %s)" % key,
LineOverflow, stacklevel=3)
self.total_nlines = total_nlines
self.pointer_nlines = pointer_nlines
self.indices_nlines = indices_nlines
self.values_nlines = values_nlines
parser = FortranFormatParser()
pointer_format = parser.parse(pointer_format_str)
if not isinstance(pointer_format, IntFormat):
raise ValueError("Expected int format for pointer format, got %s"
% pointer_format)
indices_format = parser.parse(indices_format_str)
if not isinstance(indices_format, IntFormat):
raise ValueError("Expected int format for indices format, got %s" %
indices_format)
values_format = parser.parse(values_format_str)
if isinstance(values_format, ExpFormat):
if mxtype.value_type not in ["real", "complex"]:
raise ValueError(f"Inconsistency between matrix type {mxtype} and "
f"value type {values_format}")
values_dtype = np.float64
elif isinstance(values_format, IntFormat):
if mxtype.value_type not in ["integer"]:
raise ValueError(f"Inconsistency between matrix type {mxtype} and "
f"value type {values_format}")
# XXX: fortran int -> dtype association ?
values_dtype = int
else:
raise ValueError(f"Unsupported format for values {values_format!r}")
self.pointer_format = pointer_format
self.indices_format = indices_format
self.values_format = values_format
self.pointer_dtype = np.int32
self.indices_dtype = np.int32
self.values_dtype = values_dtype
self.pointer_nlines = pointer_nlines
self.pointer_nbytes_full = _nbytes_full(pointer_format, pointer_nlines)
self.indices_nlines = indices_nlines
self.indices_nbytes_full = _nbytes_full(indices_format, indices_nlines)
self.values_nlines = values_nlines
self.values_nbytes_full = _nbytes_full(values_format, values_nlines)
self.nrows = nrows
self.ncols = ncols
self.nnon_zeros = nnon_zeros
self.nelementals = nelementals
self.mxtype = mxtype
def dump(self):
"""Gives the header corresponding to this instance as a string."""
header = [self.title.ljust(72) + self.key.ljust(8)]
header.append("%14d%14d%14d%14d" %
(self.total_nlines, self.pointer_nlines,
self.indices_nlines, self.values_nlines))
header.append("%14s%14d%14d%14d%14d" %
(self.mxtype.fortran_format.ljust(14), self.nrows,
self.ncols, self.nnon_zeros, 0))
pffmt = self.pointer_format.fortran_format
iffmt = self.indices_format.fortran_format
vffmt = self.values_format.fortran_format
header.append("%16s%16s%20s" %
(pffmt.ljust(16), iffmt.ljust(16), vffmt.ljust(20)))
return "\n".join(header)
def _expect_int(value, msg=None):
try:
return int(value)
except ValueError as e:
if msg is None:
msg = "Expected an int, got %s"
raise ValueError(msg % value) from e
def _read_hb_data(content, header):
# XXX: look at a way to reduce memory here (big string creation)
ptr_string = "".join([content.read(header.pointer_nbytes_full),
content.readline()])
ptr = np.fromstring(ptr_string,
dtype=int, sep=' ')
ind_string = "".join([content.read(header.indices_nbytes_full),
content.readline()])
ind = np.fromstring(ind_string,
dtype=int, sep=' ')
val_string = "".join([content.read(header.values_nbytes_full),
content.readline()])
val = np.fromstring(val_string,
dtype=header.values_dtype, sep=' ')
return csc_matrix((val, ind-1, ptr-1),
shape=(header.nrows, header.ncols))
def _write_data(m, fid, header):
m = m.tocsc(copy=False)
def write_array(f, ar, nlines, fmt):
# nlines is the total number of lines, fmt.repeat is the number of items
# per line, and fmt is the parsed Fortran format
pyfmt = fmt.python_format
pyfmt_full = pyfmt * fmt.repeat
# for each array to write, we first write the full lines, and special
# case for partial line
full = ar[:(nlines - 1) * fmt.repeat]
for row in full.reshape((nlines-1, fmt.repeat)):
f.write(pyfmt_full % tuple(row) + "\n")
nremain = ar.size - full.size
if nremain > 0:
f.write((pyfmt * nremain) % tuple(ar[ar.size - nremain:]) + "\n")
fid.write(header.dump())
fid.write("\n")
# +1 is for Fortran one-based indexing
write_array(fid, m.indptr+1, header.pointer_nlines,
header.pointer_format)
write_array(fid, m.indices+1, header.indices_nlines,
header.indices_format)
write_array(fid, m.data, header.values_nlines,
header.values_format)
class HBMatrixType:
"""Class to hold the matrix type."""
# q2f* translates qualified names to Fortran character
_q2f_type = {
"real": "R",
"complex": "C",
"pattern": "P",
"integer": "I",
}
_q2f_structure = {
"symmetric": "S",
"unsymmetric": "U",
"hermitian": "H",
"skewsymmetric": "Z",
"rectangular": "R"
}
_q2f_storage = {
"assembled": "A",
"elemental": "E",
}
_f2q_type = {j: i for i, j in _q2f_type.items()}
_f2q_structure = {j: i for i, j in _q2f_structure.items()}
_f2q_storage = {j: i for i, j in _q2f_storage.items()}
@classmethod
def from_fortran(cls, fmt):
if not len(fmt) == 3:
raise ValueError("Fortran format for matrix type should be 3 "
"characters long")
try:
value_type = cls._f2q_type[fmt[0]]
structure = cls._f2q_structure[fmt[1]]
storage = cls._f2q_storage[fmt[2]]
return cls(value_type, structure, storage)
except KeyError as e:
raise ValueError("Unrecognized format %s" % fmt) from e
def __init__(self, value_type, structure, storage="assembled"):
self.value_type = value_type
self.structure = structure
self.storage = storage
if value_type not in self._q2f_type:
raise ValueError("Unrecognized type %s" % value_type)
if structure not in self._q2f_structure:
raise ValueError("Unrecognized structure %s" % structure)
if storage not in self._q2f_storage:
raise ValueError("Unrecognized storage %s" % storage)
@property
def fortran_format(self):
return self._q2f_type[self.value_type] + \
self._q2f_structure[self.structure] + \
self._q2f_storage[self.storage]
def __repr__(self):
return f"HBMatrixType({self.value_type}, {self.structure}, {self.storage})"
class HBFile:
def __init__(self, file, hb_info=None):
"""Create a HBFile instance.
Parameters
----------
file : file-object
StringIO works as well
hb_info : HBInfo, optional
Should be given as an argument for writing, in which case the file
should be writable.
"""
self._fid = file
if hb_info is None:
self._hb_info = HBInfo.from_file(file)
else:
#raise OSError("file %s is not writable, and hb_info "
# "was given." % file)
self._hb_info = hb_info
@property
def title(self):
return self._hb_info.title
@property
def key(self):
return self._hb_info.key
@property
def type(self):
return self._hb_info.mxtype.value_type
@property
def structure(self):
return self._hb_info.mxtype.structure
@property
def storage(self):
return self._hb_info.mxtype.storage
def read_matrix(self):
return _read_hb_data(self._fid, self._hb_info)
def write_matrix(self, m):
return _write_data(m, self._fid, self._hb_info)
def hb_read(path_or_open_file):
"""Read HB-format file.
Parameters
----------
path_or_open_file : path-like or file-like
If a file-like object, it is used as-is. Otherwise, it is opened
before reading.
Returns
-------
data : scipy.sparse.csc_matrix instance
The data read from the HB file as a sparse matrix.
Notes
-----
At the moment, the full Harwell-Boeing format is not supported. Supported
features are:
- assembled, non-symmetric, real matrices
- integer for pointer/indices
- exponential format for float values, and int format
Examples
--------
We can read and write a Harwell-Boeing format file:
>>> from scipy.io import hb_read, hb_write
>>> from scipy.sparse import csr_array, eye
>>> data = csr_array(eye(3)) # create a sparse array
>>> hb_write("data.hb", data) # write a hb file
>>> print(hb_read("data.hb")) # read a hb file
<Compressed Sparse Column sparse matrix of dtype 'float64'
with 3 stored elements and shape (3, 3)>
Coords Values
(0, 0) 1.0
(1, 1) 1.0
(2, 2) 1.0
"""
def _get_matrix(fid):
hb = HBFile(fid)
return hb.read_matrix()
if hasattr(path_or_open_file, 'read'):
return _get_matrix(path_or_open_file)
else:
with open(path_or_open_file) as f:
return _get_matrix(f)
def hb_write(path_or_open_file, m, hb_info=None):
"""Write HB-format file.
Parameters
----------
path_or_open_file : path-like or file-like
If a file-like object, it is used as-is. Otherwise, it is opened
before writing.
m : sparse-matrix
The sparse matrix to write.
hb_info : HBInfo, optional
Metadata for the write; derived from `m` when not given.
Returns
-------
None
Notes
-----
At the moment, the full Harwell-Boeing format is not supported. Supported
features are:
- assembled, non-symmetric, real matrices
- integer for pointer/indices
- exponential format for float values, and int format
Examples
--------
We can read and write a Harwell-Boeing format file:
>>> from scipy.io import hb_read, hb_write
>>> from scipy.sparse import csr_array, eye
>>> data = csr_array(eye(3)) # create a sparse array
>>> hb_write("data.hb", data) # write a hb file
>>> print(hb_read("data.hb")) # read a hb file
<Compressed Sparse Column sparse matrix of dtype 'float64'
with 3 stored elements and shape (3, 3)>
Coords Values
(0, 0) 1.0
(1, 1) 1.0
(2, 2) 1.0
"""
m = m.tocsc(copy=False)
if hb_info is None:
hb_info = HBInfo.from_data(m)
def _set_matrix(fid):
hb = HBFile(fid, hb_info)
return hb.write_matrix(m)
if hasattr(path_or_open_file, 'write'):
return _set_matrix(path_or_open_file)
else:
with open(path_or_open_file, 'w') as f:
return _set_matrix(f)
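# Illustrative sketch (not part of the module): write and re-read a small
# identity matrix; 'demo.hb' is a hypothetical file name in a writable
# working directory.
if __name__ == "__main__":
    from scipy.sparse import eye

    hb_write('demo.hb', csc_matrix(eye(3)))
    print(hb_read('demo.hb').toarray())  # 3x3 identity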

View File

@@ -0,0 +1,74 @@
import numpy as np
from numpy.testing import assert_equal
from pytest import raises as assert_raises
from scipy.io._harwell_boeing._fortran_format_parser import (
FortranFormatParser, IntFormat, ExpFormat, BadFortranFormat)
class TestFortranFormatParser:
def setup_method(self):
self.parser = FortranFormatParser()
def _test_equal(self, format, ref):
ret = self.parser.parse(format)
assert_equal(ret.__dict__, ref.__dict__)
def test_simple_int(self):
self._test_equal("(I4)", IntFormat(4))
def test_simple_repeated_int(self):
self._test_equal("(3I4)", IntFormat(4, repeat=3))
def test_simple_exp(self):
self._test_equal("(E4.3)", ExpFormat(4, 3))
def test_exp_exp(self):
self._test_equal("(E8.3E3)", ExpFormat(8, 3, 3))
def test_repeat_exp(self):
self._test_equal("(2E4.3)", ExpFormat(4, 3, repeat=2))
def test_repeat_exp_exp(self):
self._test_equal("(2E8.3E3)", ExpFormat(8, 3, 3, repeat=2))
def test_wrong_formats(self):
def _test_invalid(bad_format):
assert_raises(BadFortranFormat, lambda: self.parser.parse(bad_format))
_test_invalid("I4")
_test_invalid("(E4)")
_test_invalid("(E4.)")
_test_invalid("(E4.E3)")
class TestIntFormat:
def test_to_fortran(self):
f = [IntFormat(10), IntFormat(12, 10), IntFormat(12, 10, 3)]
res = ["(I10)", "(I12.10)", "(3I12.10)"]
for i, j in zip(f, res):
assert_equal(i.fortran_format, j)
def test_from_number(self):
f = [10, -12, 123456789]
r_f = [IntFormat(3, repeat=26), IntFormat(4, repeat=20),
IntFormat(10, repeat=8)]
for i, j in zip(f, r_f):
assert_equal(IntFormat.from_number(i).__dict__, j.__dict__)
class TestExpFormat:
def test_to_fortran(self):
f = [ExpFormat(10, 5), ExpFormat(12, 10), ExpFormat(12, 10, min=3),
ExpFormat(10, 5, repeat=3)]
res = ["(E10.5)", "(E12.10)", "(E12.10E3)", "(3E10.5)"]
for i, j in zip(f, res):
assert_equal(i.fortran_format, j)
def test_from_number(self):
f = np.array([1.0, -1.2])
r_f = [ExpFormat(24, 16, repeat=3), ExpFormat(25, 16, repeat=3)]
for i, j in zip(f, r_f):
assert_equal(ExpFormat.from_number(i).__dict__, j.__dict__)

View File

@@ -0,0 +1,65 @@
from io import StringIO
import tempfile
import numpy as np
from numpy.testing import assert_equal, assert_array_almost_equal_nulp
from scipy.sparse import coo_matrix, csc_matrix, rand
from scipy.io import hb_read, hb_write
SIMPLE = """\
No Title |No Key
9 4 1 4
RUA 100 100 10 0
(26I3) (26I3) (3E23.15)
1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 4 4 4 6 6 6 6 6 6 6 6 6 6 6 8 9 9 9 9
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 11
37 71 89 18 30 45 70 19 25 52
2.971243799687726e-01 3.662366682877375e-01 4.786962174699534e-01
6.490068647991184e-01 6.617490424831662e-02 8.870370343191623e-01
4.196478590163001e-01 5.649603072111251e-01 9.934423887087086e-01
6.912334991524289e-01
"""
SIMPLE_MATRIX = coo_matrix(
((0.297124379969, 0.366236668288, 0.47869621747, 0.649006864799,
0.0661749042483, 0.887037034319, 0.419647859016,
0.564960307211, 0.993442388709, 0.691233499152,),
(np.array([[36, 70, 88, 17, 29, 44, 69, 18, 24, 51],
[0, 4, 58, 61, 61, 72, 72, 73, 99, 99]]))))
def assert_csc_almost_equal(r, l):
r = csc_matrix(r)
l = csc_matrix(l)
assert_equal(r.indptr, l.indptr)
assert_equal(r.indices, l.indices)
assert_array_almost_equal_nulp(r.data, l.data, 10000)
class TestHBReader:
def test_simple(self):
m = hb_read(StringIO(SIMPLE))
assert_csc_almost_equal(m, SIMPLE_MATRIX)
class TestHBReadWrite:
def check_save_load(self, value):
with tempfile.NamedTemporaryFile(mode='w+t') as file:
hb_write(file, value)
file.file.seek(0)
value_loaded = hb_read(file)
assert_csc_almost_equal(value, value_loaded)
def test_simple(self):
random_matrix = rand(10, 100, 0.1)
for matrix_format in ('coo', 'csc', 'csr', 'bsr', 'dia', 'dok', 'lil'):
matrix = random_matrix.asformat(matrix_format, copy=False)
self.check_save_load(matrix)

View File

@@ -0,0 +1,918 @@
# IDLSave - a python module to read IDL 'save' files
# Copyright (c) 2010 Thomas P. Robitaille
# Many thanks to Craig Markwardt for publishing the Unofficial Format
# Specification for IDL .sav files, without which this Python module would not
# exist (http://cow.physics.wisc.edu/~craigm/idl/savefmt).
# This code was developed with permission from ITT Visual Information
# Systems. IDL(r) is a registered trademark of ITT Visual Information Systems,
# Inc. for their Interactive Data Language software.
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
__all__ = ['readsav']
import struct
import numpy as np
import tempfile
import zlib
import warnings
# Define the different data types that can be found in an IDL save file
DTYPE_DICT = {1: '>u1',
2: '>i2',
3: '>i4',
4: '>f4',
5: '>f8',
6: '>c8',
7: '|O',
8: '|O',
9: '>c16',
10: '|O',
11: '|O',
12: '>u2',
13: '>u4',
14: '>i8',
15: '>u8'}
# Define the different record types that can be found in an IDL save file
RECTYPE_DICT = {0: "START_MARKER",
1: "COMMON_VARIABLE",
2: "VARIABLE",
3: "SYSTEM_VARIABLE",
6: "END_MARKER",
10: "TIMESTAMP",
12: "COMPILED",
13: "IDENTIFICATION",
14: "VERSION",
15: "HEAP_HEADER",
16: "HEAP_DATA",
17: "PROMOTE64",
19: "NOTICE",
20: "DESCRIPTION"}
# Define a dictionary to contain structure definitions
STRUCT_DICT = {}
def _align_32(f):
'''Align to the next 32-bit position in a file'''
pos = f.tell()
if pos % 4 != 0:
f.seek(pos + 4 - pos % 4)
return
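# For example: a file position of 5 is advanced to 8, while positions that
# are already multiples of 4 are left unchanged.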
def _skip_bytes(f, n):
'''Skip `n` bytes'''
f.read(n)
return
def _read_bytes(f, n):
'''Read the next `n` bytes'''
return f.read(n)
def _read_byte(f):
'''Read a single byte'''
return np.uint8(struct.unpack('>B', f.read(4)[:1])[0])
def _read_long(f):
'''Read a signed 32-bit integer'''
return np.int32(struct.unpack('>l', f.read(4))[0])
def _read_int16(f):
'''Read a signed 16-bit integer'''
return np.int16(struct.unpack('>h', f.read(4)[2:4])[0])
def _read_int32(f):
'''Read a signed 32-bit integer'''
return np.int32(struct.unpack('>i', f.read(4))[0])
def _read_int64(f):
'''Read a signed 64-bit integer'''
return np.int64(struct.unpack('>q', f.read(8))[0])
def _read_uint16(f):
'''Read an unsigned 16-bit integer'''
return np.uint16(struct.unpack('>H', f.read(4)[2:4])[0])
def _read_uint32(f):
'''Read an unsigned 32-bit integer'''
return np.uint32(struct.unpack('>I', f.read(4))[0])
def _read_uint64(f):
'''Read an unsigned 64-bit integer'''
return np.uint64(struct.unpack('>Q', f.read(8))[0])
def _read_float32(f):
'''Read a 32-bit float'''
return np.float32(struct.unpack('>f', f.read(4))[0])
def _read_float64(f):
'''Read a 64-bit float'''
return np.float64(struct.unpack('>d', f.read(8))[0])
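# Note on the readers above: IDL save files store scalars big-endian and
# padded to 32-bit units, which is why the 16-bit readers consume 4 bytes
# and keep only the last two, and _read_byte consumes 4 bytes and keeps the
# first.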
class Pointer:
'''Class used to define pointers'''
def __init__(self, index):
self.index = index
return
class ObjectPointer(Pointer):
'''Class used to define object pointers'''
pass
def _read_string(f):
'''Read a string'''
length = _read_long(f)
if length > 0:
chars = _read_bytes(f, length).decode('latin1')
_align_32(f)
else:
chars = ''
return chars
def _read_string_data(f):
'''Read a data string (length is specified twice)'''
length = _read_long(f)
if length > 0:
length = _read_long(f)
string_data = _read_bytes(f, length)
_align_32(f)
else:
string_data = ''
return string_data
def _read_data(f, dtype):
'''Read a variable with a specified data type'''
if dtype == 1:
if _read_int32(f) != 1:
raise Exception("Error occurred while reading byte variable")
return _read_byte(f)
elif dtype == 2:
return _read_int16(f)
elif dtype == 3:
return _read_int32(f)
elif dtype == 4:
return _read_float32(f)
elif dtype == 5:
return _read_float64(f)
elif dtype == 6:
real = _read_float32(f)
imag = _read_float32(f)
return np.complex64(real + imag * 1j)
elif dtype == 7:
return _read_string_data(f)
elif dtype == 8:
raise Exception("Should not be here - please report this")
elif dtype == 9:
real = _read_float64(f)
imag = _read_float64(f)
return np.complex128(real + imag * 1j)
elif dtype == 10:
return Pointer(_read_int32(f))
elif dtype == 11:
return ObjectPointer(_read_int32(f))
elif dtype == 12:
return _read_uint16(f)
elif dtype == 13:
return _read_uint32(f)
elif dtype == 14:
return _read_int64(f)
elif dtype == 15:
return _read_uint64(f)
else:
raise Exception("Unknown IDL type: %i - please report this" % dtype)
def _read_structure(f, array_desc, struct_desc):
'''
Read a structure, with the array and structure descriptors given as
`array_desc` and `structure_desc` respectively.
'''
nrows = array_desc['nelements']
columns = struct_desc['tagtable']
dtype = []
for col in columns:
if col['structure'] or col['array']:
dtype.append(((col['name'].lower(), col['name']), np.object_))
else:
if col['typecode'] in DTYPE_DICT:
dtype.append(((col['name'].lower(), col['name']),
DTYPE_DICT[col['typecode']]))
else:
raise Exception("Variable type %i not implemented" %
col['typecode'])
structure = np.rec.recarray((nrows, ), dtype=dtype)
for i in range(nrows):
for col in columns:
dtype = col['typecode']
if col['structure']:
structure[col['name']][i] = _read_structure(f,
struct_desc['arrtable'][col['name']],
struct_desc['structtable'][col['name']])
elif col['array']:
structure[col['name']][i] = _read_array(f, dtype,
struct_desc['arrtable'][col['name']])
else:
structure[col['name']][i] = _read_data(f, dtype)
# Reshape structure if needed
if array_desc['ndims'] > 1:
dims = array_desc['dims'][:int(array_desc['ndims'])]
dims.reverse()
structure = structure.reshape(dims)
return structure
def _read_array(f, typecode, array_desc):
'''
Read an array of type `typecode`, with the array descriptor given as
`array_desc`.
'''
if typecode in [1, 3, 4, 5, 6, 9, 13, 14, 15]:
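        # These fixed-width types are stored contiguously, so the whole
        # buffer can be decoded in a single np.frombuffer call.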
if typecode == 1:
nbytes = _read_int32(f)
if nbytes != array_desc['nbytes']:
warnings.warn("Not able to verify number of bytes from header",
stacklevel=3)
# Read bytes as numpy array
array = np.frombuffer(f.read(array_desc['nbytes']),
dtype=DTYPE_DICT[typecode])
elif typecode in [2, 12]:
        # 2-byte types are each stored in a 4-byte field, so read twice the
        # byte count and keep every second 16-bit element
array = np.frombuffer(f.read(array_desc['nbytes']*2),
dtype=DTYPE_DICT[typecode])[1::2]
else:
# Read bytes into list
array = []
for i in range(array_desc['nelements']):
dtype = typecode
data = _read_data(f, dtype)
array.append(data)
array = np.array(array, dtype=np.object_)
# Reshape array if needed
if array_desc['ndims'] > 1:
dims = array_desc['dims'][:int(array_desc['ndims'])]
dims.reverse()
array = array.reshape(dims)
# Go to next alignment position
_align_32(f)
return array
def _read_record(f):
'''Function to read in a full record'''
record = {'rectype': _read_long(f)}
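    # The absolute offset of the next record is stored as two 32-bit words
    # (low word first, then high word).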
nextrec = _read_uint32(f)
nextrec += _read_uint32(f).astype(np.int64) * 2**32
_skip_bytes(f, 4)
if record['rectype'] not in RECTYPE_DICT:
raise Exception("Unknown RECTYPE: %i" % record['rectype'])
record['rectype'] = RECTYPE_DICT[record['rectype']]
if record['rectype'] in ["VARIABLE", "HEAP_DATA"]:
if record['rectype'] == "VARIABLE":
record['varname'] = _read_string(f)
else:
record['heap_index'] = _read_long(f)
_skip_bytes(f, 4)
rectypedesc = _read_typedesc(f)
if rectypedesc['typecode'] == 0:
if nextrec == f.tell():
record['data'] = None # Indicates NULL value
else:
raise ValueError("Unexpected type code: 0")
else:
varstart = _read_long(f)
if varstart != 7:
raise Exception("VARSTART is not 7")
if rectypedesc['structure']:
record['data'] = _read_structure(f, rectypedesc['array_desc'],
rectypedesc['struct_desc'])
elif rectypedesc['array']:
record['data'] = _read_array(f, rectypedesc['typecode'],
rectypedesc['array_desc'])
else:
dtype = rectypedesc['typecode']
record['data'] = _read_data(f, dtype)
elif record['rectype'] == "TIMESTAMP":
_skip_bytes(f, 4*256)
record['date'] = _read_string(f)
record['user'] = _read_string(f)
record['host'] = _read_string(f)
elif record['rectype'] == "VERSION":
record['format'] = _read_long(f)
record['arch'] = _read_string(f)
record['os'] = _read_string(f)
record['release'] = _read_string(f)
    elif record['rectype'] == "IDENTIFICATION":
record['author'] = _read_string(f)
record['title'] = _read_string(f)
record['idcode'] = _read_string(f)
elif record['rectype'] == "NOTICE":
record['notice'] = _read_string(f)
elif record['rectype'] == "DESCRIPTION":
record['description'] = _read_string_data(f)
elif record['rectype'] == "HEAP_HEADER":
record['nvalues'] = _read_long(f)
record['indices'] = [_read_long(f) for _ in range(record['nvalues'])]
    elif record['rectype'] == "COMMON_VARIABLE":
record['nvars'] = _read_long(f)
record['name'] = _read_string(f)
record['varnames'] = [_read_string(f) for _ in range(record['nvars'])]
elif record['rectype'] == "END_MARKER":
record['end'] = True
elif record['rectype'] == "UNKNOWN":
warnings.warn("Skipping UNKNOWN record", stacklevel=3)
elif record['rectype'] == "SYSTEM_VARIABLE":
warnings.warn("Skipping SYSTEM_VARIABLE record", stacklevel=3)
else:
raise Exception(f"record['rectype']={record['rectype']} not implemented")
f.seek(nextrec)
return record
def _read_typedesc(f):
'''Function to read in a type descriptor'''
typedesc = {'typecode': _read_long(f), 'varflags': _read_long(f)}
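    # varflags bits: value 2 marks a system variable, 4 an array,
    # 32 a structure.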
if typedesc['varflags'] & 2 == 2:
raise Exception("System variables not implemented")
typedesc['array'] = typedesc['varflags'] & 4 == 4
typedesc['structure'] = typedesc['varflags'] & 32 == 32
if typedesc['structure']:
typedesc['array_desc'] = _read_arraydesc(f)
typedesc['struct_desc'] = _read_structdesc(f)
elif typedesc['array']:
typedesc['array_desc'] = _read_arraydesc(f)
return typedesc
def _read_arraydesc(f):
'''Function to read in an array descriptor'''
arraydesc = {'arrstart': _read_long(f)}
if arraydesc['arrstart'] == 8:
_skip_bytes(f, 4)
arraydesc['nbytes'] = _read_long(f)
arraydesc['nelements'] = _read_long(f)
arraydesc['ndims'] = _read_long(f)
_skip_bytes(f, 8)
arraydesc['nmax'] = _read_long(f)
arraydesc['dims'] = [_read_long(f) for _ in range(arraydesc['nmax'])]
elif arraydesc['arrstart'] == 18:
warnings.warn("Using experimental 64-bit array read", stacklevel=3)
_skip_bytes(f, 8)
arraydesc['nbytes'] = _read_uint64(f)
arraydesc['nelements'] = _read_uint64(f)
arraydesc['ndims'] = _read_long(f)
_skip_bytes(f, 8)
arraydesc['nmax'] = 8
arraydesc['dims'] = []
for d in range(arraydesc['nmax']):
v = _read_long(f)
if v != 0:
raise Exception("Expected a zero in ARRAY_DESC")
arraydesc['dims'].append(_read_long(f))
else:
raise Exception("Unknown ARRSTART: %i" % arraydesc['arrstart'])
return arraydesc
def _read_structdesc(f):
'''Function to read in a structure descriptor'''
structdesc = {}
structstart = _read_long(f)
if structstart != 9:
raise Exception("STRUCTSTART should be 9")
structdesc['name'] = _read_string(f)
predef = _read_long(f)
structdesc['ntags'] = _read_long(f)
structdesc['nbytes'] = _read_long(f)
structdesc['predef'] = predef & 1
structdesc['inherits'] = predef & 2
structdesc['is_super'] = predef & 4
if not structdesc['predef']:
structdesc['tagtable'] = [_read_tagdesc(f)
for _ in range(structdesc['ntags'])]
for tag in structdesc['tagtable']:
tag['name'] = _read_string(f)
structdesc['arrtable'] = {tag['name']: _read_arraydesc(f)
for tag in structdesc['tagtable']
if tag['array']}
structdesc['structtable'] = {tag['name']: _read_structdesc(f)
for tag in structdesc['tagtable']
if tag['structure']}
if structdesc['inherits'] or structdesc['is_super']:
structdesc['classname'] = _read_string(f)
structdesc['nsupclasses'] = _read_long(f)
structdesc['supclassnames'] = [
_read_string(f) for _ in range(structdesc['nsupclasses'])]
structdesc['supclasstable'] = [
_read_structdesc(f) for _ in range(structdesc['nsupclasses'])]
STRUCT_DICT[structdesc['name']] = structdesc
else:
if structdesc['name'] not in STRUCT_DICT:
raise Exception("PREDEF=1 but can't find definition")
structdesc = STRUCT_DICT[structdesc['name']]
return structdesc
def _read_tagdesc(f):
'''Function to read in a tag descriptor'''
tagdesc = {'offset': _read_long(f)}
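    # An offset of -1 is a sentinel: the real offset follows as a
    # 64-bit value.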
if tagdesc['offset'] == -1:
tagdesc['offset'] = _read_uint64(f)
tagdesc['typecode'] = _read_long(f)
tagflags = _read_long(f)
tagdesc['array'] = tagflags & 4 == 4
tagdesc['structure'] = tagflags & 32 == 32
tagdesc['scalar'] = tagdesc['typecode'] in DTYPE_DICT
# Assume '10'x is scalar
return tagdesc
def _replace_heap(variable, heap):
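    # Recursively dereference Pointer instances via the heap, descending
    # into record arrays, records, and object arrays.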
if isinstance(variable, Pointer):
while isinstance(variable, Pointer):
if variable.index == 0:
variable = None
else:
if variable.index in heap:
variable = heap[variable.index]
else:
warnings.warn("Variable referenced by pointer not found "
"in heap: variable will be set to None",
stacklevel=3)
variable = None
replace, new = _replace_heap(variable, heap)
if replace:
variable = new
return True, variable
elif isinstance(variable, np.rec.recarray):
# Loop over records
for ir, record in enumerate(variable):
replace, new = _replace_heap(record, heap)
if replace:
variable[ir] = new
return False, variable
elif isinstance(variable, np.record):
# Loop over values
for iv, value in enumerate(variable):
replace, new = _replace_heap(value, heap)
if replace:
variable[iv] = new
return False, variable
elif isinstance(variable, np.ndarray):
# Loop over values if type is np.object_
if variable.dtype.type is np.object_:
for iv in range(variable.size):
replace, new = _replace_heap(variable.item(iv), heap)
if replace:
variable.reshape(-1)[iv] = new
return False, variable
else:
return False, variable
class AttrDict(dict):
'''
A case-insensitive dictionary with access via item, attribute, and call
notations:
>>> from scipy.io._idl import AttrDict
>>> d = AttrDict()
>>> d['Variable'] = 123
>>> d['Variable']
123
>>> d.Variable
123
>>> d.variable
123
>>> d('VARIABLE')
123
>>> d['missing']
    Traceback (most recent call last):
...
KeyError: 'missing'
>>> d.missing
    Traceback (most recent call last):
...
AttributeError: 'AttrDict' object has no attribute 'missing'
'''
def __init__(self, init={}):
dict.__init__(self, init)
def __getitem__(self, name):
return super().__getitem__(name.lower())
def __setitem__(self, key, value):
return super().__setitem__(key.lower(), value)
def __getattr__(self, name):
try:
return self.__getitem__(name)
except KeyError:
raise AttributeError(
f"'{type(self)}' object has no attribute '{name}'") from None
__setattr__ = __setitem__
__call__ = __getitem__
def readsav(file_name, idict=None, python_dict=False,
uncompressed_file_name=None, verbose=False):
"""
Read an IDL .sav file.
Parameters
----------
file_name : str
Name of the IDL save file.
idict : dict, optional
Dictionary in which to insert .sav file variables.
python_dict : bool, optional
        By default, the object returned is not a Python dictionary, but a
case-insensitive dictionary with item, attribute, and call access
to variables. To get a standard Python dictionary, set this option
to True.
uncompressed_file_name : str, optional
This option only has an effect for .sav files written with the
/compress option. If a file name is specified, compressed .sav
files are uncompressed to this file. Otherwise, readsav will use
the `tempfile` module to determine a temporary filename
automatically, and will remove the temporary file upon successfully
reading it in.
verbose : bool, optional
Whether to print out information about the save file, including
the records read, and available variables.
Returns
-------
idl_dict : AttrDict or dict
If `python_dict` is set to False (default), this function returns a
case-insensitive dictionary with item, attribute, and call access
to variables. If `python_dict` is set to True, this function
returns a Python dictionary with all variable names in lowercase.
If `idict` was specified, then variables are written to the
dictionary specified, and the updated dictionary is returned.
Examples
--------
>>> from os.path import dirname, join as pjoin
>>> import scipy.io as sio
>>> from scipy.io import readsav
Get the filename for an example .sav file from the tests/data directory.
>>> data_dir = pjoin(dirname(sio.__file__), 'tests', 'data')
>>> sav_fname = pjoin(data_dir, 'array_float32_1d.sav')
Load the .sav file contents.
>>> sav_data = readsav(sav_fname)
Get keys of the .sav file contents.
>>> print(sav_data.keys())
dict_keys(['array1d'])
Access a content with a key.
>>> print(sav_data['array1d'])
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0.]
"""
# Initialize record and variable holders
records = []
if python_dict or idict:
variables = {}
else:
variables = AttrDict()
# Open the IDL file
f = open(file_name, 'rb')
# Read the signature, which should be 'SR'
signature = _read_bytes(f, 2)
if signature != b'SR':
raise Exception("Invalid SIGNATURE: %s" % signature)
# Next, the record format, which is '\x00\x04' for normal .sav
# files, and '\x00\x06' for compressed .sav files.
recfmt = _read_bytes(f, 2)
if recfmt == b'\x00\x04':
pass
elif recfmt == b'\x00\x06':
if verbose:
print("IDL Save file is compressed")
if uncompressed_file_name:
fout = open(uncompressed_file_name, 'w+b')
else:
fout = tempfile.NamedTemporaryFile(suffix='.sav')
if verbose:
print(" -> expanding to %s" % fout.name)
# Write header
fout.write(b'SR\x00\x04')
# Cycle through records
while True:
# Read record type
rectype = _read_long(f)
fout.write(struct.pack('>l', int(rectype)))
# Read position of next record and return as int
nextrec = _read_uint32(f)
nextrec += _read_uint32(f).astype(np.int64) * 2**32
# Read the unknown 4 bytes
unknown = f.read(4)
# Check if the end of the file has been reached
if RECTYPE_DICT[rectype] == 'END_MARKER':
                modval = np.int64(2**32)
                fout.write(struct.pack('>I', int(nextrec % modval)))
                fout.write(struct.pack('>I', int(nextrec // modval)))
fout.write(unknown)
break
# Find current position
pos = f.tell()
# Decompress record
rec_string = zlib.decompress(f.read(nextrec-pos))
# Find new position of next record
nextrec = fout.tell() + len(rec_string) + 12
# Write out record
            fout.write(struct.pack('>I', int(nextrec % 2**32)))
            fout.write(struct.pack('>I', int(nextrec // 2**32)))
fout.write(unknown)
fout.write(rec_string)
# Close the original compressed file
f.close()
# Set f to be the decompressed file, and skip the first four bytes
f = fout
f.seek(4)
else:
raise Exception("Invalid RECFMT: %s" % recfmt)
# Loop through records, and add them to the list
while True:
r = _read_record(f)
records.append(r)
if 'end' in r:
if r['end']:
break
# Close the file
f.close()
# Find heap data variables
heap = {}
for r in records:
if r['rectype'] == "HEAP_DATA":
heap[r['heap_index']] = r['data']
# Find all variables
for r in records:
if r['rectype'] == "VARIABLE":
replace, new = _replace_heap(r['data'], heap)
if replace:
r['data'] = new
variables[r['varname'].lower()] = r['data']
if verbose:
# Print out timestamp info about the file
for record in records:
if record['rectype'] == "TIMESTAMP":
print("-"*50)
print("Date: %s" % record['date'])
print("User: %s" % record['user'])
print("Host: %s" % record['host'])
break
# Print out version info about the file
for record in records:
if record['rectype'] == "VERSION":
print("-"*50)
print("Format: %s" % record['format'])
print("Architecture: %s" % record['arch'])
print("Operating System: %s" % record['os'])
print("IDL Version: %s" % record['release'])
break
# Print out identification info about the file
for record in records:
            if record['rectype'] == "IDENTIFICATION":
print("-"*50)
print("Author: %s" % record['author'])
print("Title: %s" % record['title'])
print("ID Code: %s" % record['idcode'])
break
# Print out descriptions saved with the file
for record in records:
if record['rectype'] == "DESCRIPTION":
print("-"*50)
print("Description: %s" % record['description'])
break
print("-"*50)
print("Successfully read %i records of which:" %
(len(records)))
# Create convenience list of record types
rectypes = [r['rectype'] for r in records]
for rt in set(rectypes):
if rt != 'END_MARKER':
print(" - %i are of type %s" % (rectypes.count(rt), rt))
print("-"*50)
if 'VARIABLE' in rectypes:
print("Available variables:")
for var in variables:
print(f" - {var} [{type(variables[var])}]")
print("-"*50)
if idict:
for var in variables:
idict[var] = variables[var]
return idict
else:
return variables

View File

@ -0,0 +1,961 @@
"""
Matrix Market I/O in Python.
See http://math.nist.gov/MatrixMarket/formats.html
for information about the Matrix Market format.
"""
#
# Author: Pearu Peterson <pearu@cens.ioc.ee>
# Created: October, 2004
#
# References:
# http://math.nist.gov/MatrixMarket/
#
import os
import numpy as np
from numpy import (asarray, real, imag, conj, zeros, ndarray, concatenate,
ones, can_cast)
from scipy.sparse import coo_matrix, issparse
__all__ = ['mminfo', 'mmread', 'mmwrite', 'MMFile']
# -----------------------------------------------------------------------------
def asstr(s):
if isinstance(s, bytes):
return s.decode('latin1')
return str(s)
def mminfo(source):
"""
Return size and storage parameters from Matrix Market file-like 'source'.
Parameters
----------
source : str or file-like
Matrix Market filename (extension .mtx) or open file-like object
Returns
-------
rows : int
Number of matrix rows.
cols : int
Number of matrix columns.
entries : int
Number of non-zero entries of a sparse matrix
or rows*cols for a dense matrix.
format : str
Either 'coordinate' or 'array'.
field : str
Either 'real', 'complex', 'pattern', or 'integer'.
symmetry : str
Either 'general', 'symmetric', 'skew-symmetric', or 'hermitian'.
Examples
--------
>>> from io import StringIO
>>> from scipy.io import mminfo
>>> text = '''%%MatrixMarket matrix coordinate real general
... 5 5 7
... 2 3 1.0
... 3 4 2.0
... 3 5 3.0
... 4 1 4.0
... 4 2 5.0
... 4 3 6.0
... 4 4 7.0
... '''
    ``mminfo(source)`` returns the number of rows, number of columns,
    number of entries, format, field type, and symmetry attribute of the
    source file.
>>> mminfo(StringIO(text))
(5, 5, 7, 'coordinate', 'real', 'general')
"""
return MMFile.info(source)
# -----------------------------------------------------------------------------
def mmread(source):
"""
Reads the contents of a Matrix Market file-like 'source' into a matrix.
Parameters
----------
source : str or file-like
        Matrix Market filename (extensions .mtx, .mtx.gz)
or open file-like object.
Returns
-------
a : ndarray or coo_matrix
Dense or sparse matrix depending on the matrix format in the
Matrix Market file.
Examples
--------
>>> from io import StringIO
>>> from scipy.io import mmread
>>> text = '''%%MatrixMarket matrix coordinate real general
... 5 5 7
... 2 3 1.0
... 3 4 2.0
... 3 5 3.0
... 4 1 4.0
... 4 2 5.0
... 4 3 6.0
... 4 4 7.0
... '''
``mmread(source)`` returns the data as sparse matrix in COO format.
>>> m = mmread(StringIO(text))
>>> m
<5x5 sparse matrix of type '<class 'numpy.float64'>'
with 7 stored elements in COOrdinate format>
>>> m.A
array([[0., 0., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 2., 3.],
[4., 5., 6., 7., 0.],
[0., 0., 0., 0., 0.]])
"""
return MMFile().read(source)
# -----------------------------------------------------------------------------
def mmwrite(target, a, comment='', field=None, precision=None, symmetry=None):
r"""
Writes the sparse or dense array `a` to Matrix Market file-like `target`.
Parameters
----------
target : str or file-like
Matrix Market filename (extension .mtx) or open file-like object.
a : array like
Sparse or dense 2-D array.
comment : str, optional
Comments to be prepended to the Matrix Market file.
field : None or str, optional
Either 'real', 'complex', 'pattern', or 'integer'.
precision : None or int, optional
Number of digits to display for real or complex values.
symmetry : None or str, optional
Either 'general', 'symmetric', 'skew-symmetric', or 'hermitian'.
If symmetry is None the symmetry type of 'a' is determined by its
values.
Returns
-------
None
Examples
--------
>>> from io import BytesIO
>>> import numpy as np
>>> from scipy.sparse import coo_matrix
>>> from scipy.io import mmwrite
Write a small NumPy array to a matrix market file. The file will be
written in the ``'array'`` format.
>>> a = np.array([[1.0, 0, 0, 0], [0, 2.5, 0, 6.25]])
>>> target = BytesIO()
>>> mmwrite(target, a)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix array real general
%
2 4
1.0000000000000000e+00
0.0000000000000000e+00
0.0000000000000000e+00
2.5000000000000000e+00
0.0000000000000000e+00
0.0000000000000000e+00
0.0000000000000000e+00
6.2500000000000000e+00
Add a comment to the output file, and set the precision to 3.
>>> target = BytesIO()
>>> mmwrite(target, a, comment='\n Some test data.\n', precision=3)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix array real general
%
% Some test data.
%
2 4
1.000e+00
0.000e+00
0.000e+00
2.500e+00
0.000e+00
0.000e+00
0.000e+00
6.250e+00
Convert to a sparse matrix before calling ``mmwrite``. This will
result in the output format being ``'coordinate'`` rather than
``'array'``.
>>> target = BytesIO()
>>> mmwrite(target, coo_matrix(a), precision=3)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix coordinate real general
%
2 4 3
1 1 1.00e+00
2 2 2.50e+00
2 4 6.25e+00
Write a complex Hermitian array to a matrix market file. Note that
only six values are actually written to the file; the other values
are implied by the symmetry.
>>> z = np.array([[3, 1+2j, 4-3j], [1-2j, 1, -5j], [4+3j, 5j, 2.5]])
>>> z
array([[ 3. +0.j, 1. +2.j, 4. -3.j],
[ 1. -2.j, 1. +0.j, -0. -5.j],
[ 4. +3.j, 0. +5.j, 2.5+0.j]])
>>> target = BytesIO()
>>> mmwrite(target, z, precision=2)
>>> print(target.getvalue().decode('latin1'))
%%MatrixMarket matrix array complex hermitian
%
3 3
3.00e+00 0.00e+00
1.00e+00 -2.00e+00
4.00e+00 3.00e+00
1.00e+00 0.00e+00
0.00e+00 5.00e+00
2.50e+00 0.00e+00
"""
MMFile().write(target, a, comment, field, precision, symmetry)
###############################################################################
class MMFile:
__slots__ = ('_rows',
'_cols',
'_entries',
'_format',
'_field',
'_symmetry')
@property
def rows(self):
return self._rows
@property
def cols(self):
return self._cols
@property
def entries(self):
return self._entries
@property
def format(self):
return self._format
@property
def field(self):
return self._field
@property
def symmetry(self):
return self._symmetry
@property
def has_symmetry(self):
return self._symmetry in (self.SYMMETRY_SYMMETRIC,
self.SYMMETRY_SKEW_SYMMETRIC,
self.SYMMETRY_HERMITIAN)
# format values
FORMAT_COORDINATE = 'coordinate'
FORMAT_ARRAY = 'array'
FORMAT_VALUES = (FORMAT_COORDINATE, FORMAT_ARRAY)
@classmethod
def _validate_format(self, format):
if format not in self.FORMAT_VALUES:
msg = f'unknown format type {format}, must be one of {self.FORMAT_VALUES}'
raise ValueError(msg)
# field values
FIELD_INTEGER = 'integer'
FIELD_UNSIGNED = 'unsigned-integer'
FIELD_REAL = 'real'
FIELD_COMPLEX = 'complex'
FIELD_PATTERN = 'pattern'
FIELD_VALUES = (FIELD_INTEGER, FIELD_UNSIGNED, FIELD_REAL, FIELD_COMPLEX,
FIELD_PATTERN)
@classmethod
def _validate_field(self, field):
if field not in self.FIELD_VALUES:
msg = f'unknown field type {field}, must be one of {self.FIELD_VALUES}'
raise ValueError(msg)
# symmetry values
SYMMETRY_GENERAL = 'general'
SYMMETRY_SYMMETRIC = 'symmetric'
SYMMETRY_SKEW_SYMMETRIC = 'skew-symmetric'
SYMMETRY_HERMITIAN = 'hermitian'
SYMMETRY_VALUES = (SYMMETRY_GENERAL, SYMMETRY_SYMMETRIC,
SYMMETRY_SKEW_SYMMETRIC, SYMMETRY_HERMITIAN)
@classmethod
def _validate_symmetry(self, symmetry):
if symmetry not in self.SYMMETRY_VALUES:
raise ValueError(f'unknown symmetry type {symmetry}, '
f'must be one of {self.SYMMETRY_VALUES}')
DTYPES_BY_FIELD = {FIELD_INTEGER: 'intp',
FIELD_UNSIGNED: 'uint64',
FIELD_REAL: 'd',
FIELD_COMPLEX: 'D',
FIELD_PATTERN: 'd'}
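    # Pattern matrices carry no stored values; float64 acts as a placeholder.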
# -------------------------------------------------------------------------
@staticmethod
def reader():
pass
# -------------------------------------------------------------------------
@staticmethod
def writer():
pass
# -------------------------------------------------------------------------
@classmethod
def info(self, source):
"""
Return size, storage parameters from Matrix Market file-like 'source'.
Parameters
----------
source : str or file-like
Matrix Market filename (extension .mtx) or open file-like object
Returns
-------
rows : int
Number of matrix rows.
cols : int
Number of matrix columns.
entries : int
Number of non-zero entries of a sparse matrix
or rows*cols for a dense matrix.
format : str
Either 'coordinate' or 'array'.
field : str
Either 'real', 'complex', 'pattern', or 'integer'.
symmetry : str
Either 'general', 'symmetric', 'skew-symmetric', or 'hermitian'.
"""
stream, close_it = self._open(source)
try:
# read and validate header line
line = stream.readline()
mmid, matrix, format, field, symmetry = \
(asstr(part.strip()) for part in line.split())
if not mmid.startswith('%%MatrixMarket'):
raise ValueError('source is not in Matrix Market format')
if not matrix.lower() == 'matrix':
raise ValueError("Problem reading file header: " + line)
# http://math.nist.gov/MatrixMarket/formats.html
if format.lower() == 'array':
format = self.FORMAT_ARRAY
elif format.lower() == 'coordinate':
format = self.FORMAT_COORDINATE
            # skip comment lines: first character is '%' (or byte 37 when
            # the stream yields bytes)
while line:
if line.lstrip() and line.lstrip()[0] in ['%', 37]:
line = stream.readline()
else:
break
# skip empty lines
while not line.strip():
line = stream.readline()
split_line = line.split()
if format == self.FORMAT_ARRAY:
if not len(split_line) == 2:
raise ValueError("Header line not of length 2: " +
line.decode('ascii'))
rows, cols = map(int, split_line)
entries = rows * cols
else:
if not len(split_line) == 3:
raise ValueError("Header line not of length 3: " +
line.decode('ascii'))
rows, cols, entries = map(int, split_line)
return (rows, cols, entries, format, field.lower(),
symmetry.lower())
finally:
if close_it:
stream.close()
# -------------------------------------------------------------------------
@staticmethod
def _open(filespec, mode='rb'):
""" Return an open file stream for reading based on source.
If source is a file name, open it (after trying to find it with mtx and
gzipped mtx extensions). Otherwise, just return source.
Parameters
----------
filespec : str or file-like
String giving file name or file-like object
mode : str, optional
Mode with which to open file, if `filespec` is a file name.
Returns
-------
fobj : file-like
Open file-like object.
close_it : bool
True if the calling function should close this file when done,
false otherwise.
"""
# If 'filespec' is path-like (str, pathlib.Path, os.DirEntry, other class
# implementing a '__fspath__' method), try to convert it to str. If this
# fails by throwing a 'TypeError', assume it's an open file handle and
# return it as-is.
try:
filespec = os.fspath(filespec)
except TypeError:
return filespec, False
# 'filespec' is definitely a str now
# open for reading
if mode[0] == 'r':
# determine filename plus extension
if not os.path.isfile(filespec):
if os.path.isfile(filespec+'.mtx'):
filespec = filespec + '.mtx'
elif os.path.isfile(filespec+'.mtx.gz'):
filespec = filespec + '.mtx.gz'
elif os.path.isfile(filespec+'.mtx.bz2'):
filespec = filespec + '.mtx.bz2'
# open filename
if filespec.endswith('.gz'):
import gzip
stream = gzip.open(filespec, mode)
elif filespec.endswith('.bz2'):
import bz2
stream = bz2.BZ2File(filespec, 'rb')
else:
stream = open(filespec, mode)
# open for writing
else:
if filespec[-4:] != '.mtx':
filespec = filespec + '.mtx'
stream = open(filespec, mode)
return stream, True
# -------------------------------------------------------------------------
@staticmethod
def _get_symmetry(a):
m, n = a.shape
if m != n:
return MMFile.SYMMETRY_GENERAL
issymm = True
isskew = True
isherm = a.dtype.char in 'FD'
# sparse input
if issparse(a):
# check if number of nonzero entries of lower and upper triangle
# matrix are equal
a = a.tocoo()
(row, col) = a.nonzero()
if (row < col).sum() != (row > col).sum():
return MMFile.SYMMETRY_GENERAL
# define iterator over symmetric pair entries
a = a.todok()
def symm_iterator():
for ((i, j), aij) in a.items():
if i > j:
aji = a[j, i]
yield (aij, aji, False)
elif i == j:
yield (aij, aij, True)
# non-sparse input
else:
# define iterator over symmetric pair entries
def symm_iterator():
for j in range(n):
for i in range(j, n):
aij, aji = a[i][j], a[j][i]
yield (aij, aji, i == j)
# check for symmetry
# yields aij, aji, is_diagonal
for (aij, aji, is_diagonal) in symm_iterator():
if isskew and is_diagonal and aij != 0:
isskew = False
else:
if issymm and aij != aji:
issymm = False
with np.errstate(over="ignore"):
# This can give a warning for uint dtypes, so silence that
if isskew and aij != -aji:
isskew = False
if isherm and aij != conj(aji):
isherm = False
if not (issymm or isskew or isherm):
break
# return symmetry value
if issymm:
return MMFile.SYMMETRY_SYMMETRIC
if isskew:
return MMFile.SYMMETRY_SKEW_SYMMETRIC
if isherm:
return MMFile.SYMMETRY_HERMITIAN
return MMFile.SYMMETRY_GENERAL
# -------------------------------------------------------------------------
@staticmethod
def _field_template(field, precision):
return {MMFile.FIELD_REAL: '%%.%ie\n' % precision,
MMFile.FIELD_INTEGER: '%i\n',
MMFile.FIELD_UNSIGNED: '%u\n',
MMFile.FIELD_COMPLEX: '%%.%ie %%.%ie\n' %
(precision, precision)
}.get(field, None)
# -------------------------------------------------------------------------
def __init__(self, **kwargs):
self._init_attrs(**kwargs)
# -------------------------------------------------------------------------
def read(self, source):
"""
Reads the contents of a Matrix Market file-like 'source' into a matrix.
Parameters
----------
source : str or file-like
            Matrix Market filename (extensions .mtx, .mtx.gz)
or open file object.
Returns
-------
a : ndarray or coo_matrix
Dense or sparse matrix depending on the matrix format in the
Matrix Market file.
"""
stream, close_it = self._open(source)
try:
self._parse_header(stream)
return self._parse_body(stream)
finally:
if close_it:
stream.close()
# -------------------------------------------------------------------------
def write(self, target, a, comment='', field=None, precision=None,
symmetry=None):
"""
Writes sparse or dense array `a` to Matrix Market file-like `target`.
Parameters
----------
target : str or file-like
Matrix Market filename (extension .mtx) or open file-like object.
a : array like
Sparse or dense 2-D array.
comment : str, optional
Comments to be prepended to the Matrix Market file.
field : None or str, optional
Either 'real', 'complex', 'pattern', or 'integer'.
precision : None or int, optional
Number of digits to display for real or complex values.
symmetry : None or str, optional
Either 'general', 'symmetric', 'skew-symmetric', or 'hermitian'.
If symmetry is None the symmetry type of 'a' is determined by its
values.
"""
stream, close_it = self._open(target, 'wb')
try:
self._write(stream, a, comment, field, precision, symmetry)
finally:
if close_it:
stream.close()
else:
stream.flush()
# -------------------------------------------------------------------------
def _init_attrs(self, **kwargs):
"""
        Initialize each attribute with the corresponding keyword argument
        value, or a default of None.
"""
attrs = self.__class__.__slots__
public_attrs = [attr[1:] for attr in attrs]
invalid_keys = set(kwargs.keys()) - set(public_attrs)
if invalid_keys:
            raise ValueError(f'found {tuple(invalid_keys)} invalid keyword '
                             f'arguments, please only use {public_attrs}')
for attr in attrs:
setattr(self, attr, kwargs.get(attr[1:], None))
# -------------------------------------------------------------------------
def _parse_header(self, stream):
rows, cols, entries, format, field, symmetry = \
self.__class__.info(stream)
self._init_attrs(rows=rows, cols=cols, entries=entries, format=format,
field=field, symmetry=symmetry)
# -------------------------------------------------------------------------
def _parse_body(self, stream):
rows, cols, entries, format, field, symm = (self.rows, self.cols,
self.entries, self.format,
self.field, self.symmetry)
dtype = self.DTYPES_BY_FIELD.get(field, None)
has_symmetry = self.has_symmetry
is_integer = field == self.FIELD_INTEGER
is_unsigned_integer = field == self.FIELD_UNSIGNED
is_complex = field == self.FIELD_COMPLEX
is_skew = symm == self.SYMMETRY_SKEW_SYMMETRIC
is_herm = symm == self.SYMMETRY_HERMITIAN
is_pattern = field == self.FIELD_PATTERN
if format == self.FORMAT_ARRAY:
a = zeros((rows, cols), dtype=dtype)
line = 1
i, j = 0, 0
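            # Values are listed column by column; for symmetric variants only
            # the lower triangle (i >= j) is stored in the file.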
if is_skew:
a[i, j] = 0
if i < rows - 1:
i += 1
while line:
line = stream.readline()
                # comment line: first character is '%' (or byte 37 for bytes)
if not line or line[0] in ['%', 37] or not line.strip():
continue
if is_integer:
aij = int(line)
elif is_unsigned_integer:
aij = int(line)
elif is_complex:
aij = complex(*map(float, line.split()))
else:
aij = float(line)
a[i, j] = aij
if has_symmetry and i != j:
if is_skew:
a[j, i] = -aij
elif is_herm:
a[j, i] = conj(aij)
else:
a[j, i] = aij
if i < rows-1:
i = i + 1
else:
j = j + 1
if not has_symmetry:
i = 0
else:
i = j
if is_skew:
a[i, j] = 0
if i < rows-1:
i += 1
if is_skew:
if not (i in [0, j] and j == cols - 1):
raise ValueError("Parse error, did not read all lines.")
else:
if not (i in [0, j] and j == cols):
raise ValueError("Parse error, did not read all lines.")
elif format == self.FORMAT_COORDINATE:
# Read sparse COOrdinate format
if entries == 0:
# empty matrix
return coo_matrix((rows, cols), dtype=dtype)
I = zeros(entries, dtype='intc')
J = zeros(entries, dtype='intc')
if is_pattern:
V = ones(entries, dtype='int8')
elif is_integer:
V = zeros(entries, dtype='intp')
elif is_unsigned_integer:
V = zeros(entries, dtype='uint64')
elif is_complex:
V = zeros(entries, dtype='complex')
else:
V = zeros(entries, dtype='float')
entry_number = 0
for line in stream:
                # comment line: first character is '%' (or byte 37 for bytes)
if not line or line[0] in ['%', 37] or not line.strip():
continue
if entry_number+1 > entries:
raise ValueError("'entries' in header is smaller than "
"number of entries")
l = line.split()
I[entry_number], J[entry_number] = map(int, l[:2])
if not is_pattern:
if is_integer:
V[entry_number] = int(l[2])
elif is_unsigned_integer:
V[entry_number] = int(l[2])
elif is_complex:
V[entry_number] = complex(*map(float, l[2:]))
else:
V[entry_number] = float(l[2])
entry_number += 1
if entry_number < entries:
raise ValueError("'entries' in header is larger than "
"number of entries")
I -= 1 # adjust indices (base 1 -> base 0)
J -= 1
if has_symmetry:
mask = (I != J) # off diagonal mask
od_I = I[mask]
od_J = J[mask]
od_V = V[mask]
I = concatenate((I, od_J))
J = concatenate((J, od_I))
if is_skew:
od_V *= -1
elif is_herm:
od_V = od_V.conjugate()
V = concatenate((V, od_V))
a = coo_matrix((V, (I, J)), shape=(rows, cols), dtype=dtype)
else:
raise NotImplementedError(format)
return a
# ------------------------------------------------------------------------
def _write(self, stream, a, comment='', field=None, precision=None,
symmetry=None):
        if isinstance(a, (list, ndarray, tuple)) or hasattr(a, '__array__'):
rep = self.FORMAT_ARRAY
a = asarray(a)
if len(a.shape) != 2:
raise ValueError('Expected 2 dimensional array')
rows, cols = a.shape
if field is not None:
if field == self.FIELD_INTEGER:
if not can_cast(a.dtype, 'intp'):
raise OverflowError("mmwrite does not support integer "
"dtypes larger than native 'intp'.")
a = a.astype('intp')
elif field == self.FIELD_REAL:
if a.dtype.char not in 'fd':
a = a.astype('d')
elif field == self.FIELD_COMPLEX:
if a.dtype.char not in 'FD':
a = a.astype('D')
else:
if not issparse(a):
raise ValueError('unknown matrix type: %s' % type(a))
rep = 'coordinate'
rows, cols = a.shape
typecode = a.dtype.char
if precision is None:
if typecode in 'fF':
precision = 8
else:
precision = 16
if field is None:
kind = a.dtype.kind
if kind == 'i':
if not can_cast(a.dtype, 'intp'):
raise OverflowError("mmwrite does not support integer "
"dtypes larger than native 'intp'.")
field = 'integer'
elif kind == 'f':
field = 'real'
elif kind == 'c':
field = 'complex'
elif kind == 'u':
field = 'unsigned-integer'
else:
raise TypeError('unexpected dtype kind ' + kind)
if symmetry is None:
symmetry = self._get_symmetry(a)
# validate rep, field, and symmetry
self.__class__._validate_format(rep)
self.__class__._validate_field(field)
self.__class__._validate_symmetry(symmetry)
# write initial header line
data = f'%%MatrixMarket matrix {rep} {field} {symmetry}\n'
stream.write(data.encode('latin1'))
# write comments
for line in comment.split('\n'):
data = '%%%s\n' % (line)
stream.write(data.encode('latin1'))
template = self._field_template(field, precision)
# write dense format
if rep == self.FORMAT_ARRAY:
# write shape spec
data = '%i %i\n' % (rows, cols)
stream.write(data.encode('latin1'))
if field in (self.FIELD_INTEGER, self.FIELD_REAL,
self.FIELD_UNSIGNED):
if symmetry == self.SYMMETRY_GENERAL:
for j in range(cols):
for i in range(rows):
data = template % a[i, j]
stream.write(data.encode('latin1'))
elif symmetry == self.SYMMETRY_SKEW_SYMMETRIC:
for j in range(cols):
for i in range(j + 1, rows):
data = template % a[i, j]
stream.write(data.encode('latin1'))
else:
for j in range(cols):
for i in range(j, rows):
data = template % a[i, j]
stream.write(data.encode('latin1'))
elif field == self.FIELD_COMPLEX:
if symmetry == self.SYMMETRY_GENERAL:
for j in range(cols):
for i in range(rows):
aij = a[i, j]
data = template % (real(aij), imag(aij))
stream.write(data.encode('latin1'))
else:
for j in range(cols):
for i in range(j, rows):
aij = a[i, j]
data = template % (real(aij), imag(aij))
stream.write(data.encode('latin1'))
elif field == self.FIELD_PATTERN:
                raise ValueError('pattern type inconsistent with dense format')
else:
raise TypeError('Unknown field type %s' % field)
# write sparse format
else:
coo = a.tocoo() # convert to COOrdinate format
# if symmetry format used, remove values above main diagonal
if symmetry != self.SYMMETRY_GENERAL:
lower_triangle_mask = coo.row >= coo.col
coo = coo_matrix((coo.data[lower_triangle_mask],
(coo.row[lower_triangle_mask],
coo.col[lower_triangle_mask])),
shape=coo.shape)
# write shape spec
data = '%i %i %i\n' % (rows, cols, coo.nnz)
stream.write(data.encode('latin1'))
template = self._field_template(field, precision-1)
if field == self.FIELD_PATTERN:
for r, c in zip(coo.row+1, coo.col+1):
data = "%i %i\n" % (r, c)
stream.write(data.encode('latin1'))
elif field in (self.FIELD_INTEGER, self.FIELD_REAL,
self.FIELD_UNSIGNED):
for r, c, d in zip(coo.row+1, coo.col+1, coo.data):
data = ("%i %i " % (r, c)) + (template % d)
stream.write(data.encode('latin1'))
elif field == self.FIELD_COMPLEX:
for r, c, d in zip(coo.row+1, coo.col+1, coo.data):
data = ("%i %i " % (r, c)) + (template % (d.real, d.imag))
stream.write(data.encode('latin1'))
else:
raise TypeError('Unknown field type %s' % field)
def _is_fromfile_compatible(stream):
"""
Check whether `stream` is compatible with numpy.fromfile.
Passing a gzipped file object to ``fromfile/fromstring`` doesn't work with
Python 3.
"""
bad_cls = []
try:
import gzip
bad_cls.append(gzip.GzipFile)
except ImportError:
pass
try:
import bz2
bad_cls.append(bz2.BZ2File)
except ImportError:
pass
bad_cls = tuple(bad_cls)
return not isinstance(stream, bad_cls)

File diff suppressed because it is too large

View File

@ -0,0 +1,28 @@
"""
Module to read ARFF files
=========================
ARFF is the standard data format for WEKA.
It is a text file format which support numerical, string and data values.
The format can also represent missing data and sparse data.
Notes
-----
The ARFF support in ``scipy.io`` provides file reading functionality only.
For more extensive ARFF functionality, see `liac-arff
<https://github.com/renatopp/liac-arff>`_.
See the `WEKA website <http://weka.wikispaces.com/ARFF>`_
for more details about the ARFF format and available datasets.
"""
from ._arffread import *
from . import _arffread
# Deprecated namespaces, to be removed in v2.0.0
from . import arffread
__all__ = _arffread.__all__ + ['arffread']
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester

View File

@ -0,0 +1,907 @@
"""A module to read arff files."""
# Last Change: Mon Aug 20 08:00 PM 2007 J
import re
import datetime
import csv
import ctypes
import numpy as np
__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
# An Arff file is basically two parts:
# - header
# - data
#
# A header has each of its components starting by @META where META is one of
# the keyword (attribute of relation, for now).
# TODO:
# - both integer and reals are treated as numeric -> the integer info
# is lost!
# - Replace ValueError by ParseError or something
# We can now handle the following:
# - numeric and nominal attributes
# - missing values for numeric attributes
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
r_nominal = re.compile(r'{(.+)}')
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
# ------------------------
# Module defined exception
# ------------------------
class ArffError(OSError):
pass
class ParseArffError(ArffError):
pass
# ----------
# Attributes
# ----------
class Attribute:
type_name = None
def __init__(self, name):
self.name = name
self.range = None
self.dtype = np.object_
@classmethod
def parse_attribute(cls, name, attr_string):
"""
Parse the attribute line if it knows how. Returns the parsed
attribute, or None.
"""
return None
def parse_data(self, data_str):
"""
Parse a value of this type.
"""
return None
    def __str__(self):
        """
        Return a string representation of the attribute.
        """
return self.name + ',' + self.type_name
class NominalAttribute(Attribute):
type_name = 'nominal'
def __init__(self, name, values):
super().__init__(name)
self.values = values
self.range = values
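        # Fixed-width byte-string dtype sized to the longest nominal value.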
self.dtype = (np.bytes_, max(len(i) for i in values))
@staticmethod
def _get_nom_val(atrv):
"""Given a string containing a nominal type, returns a tuple of the
possible values.
A nominal type is defined as something framed between braces ({}).
Parameters
----------
atrv : str
Nominal type definition
Returns
-------
poss_vals : tuple
possible values
Examples
--------
>>> from scipy.io.arff._arffread import NominalAttribute
>>> NominalAttribute._get_nom_val("{floup, bouga, fl, ratata}")
('floup', 'bouga', 'fl', 'ratata')
"""
m = r_nominal.match(atrv)
if m:
attrs, _ = split_data_line(m.group(1))
return tuple(attrs)
else:
raise ValueError("This does not look like a nominal string")
@classmethod
def parse_attribute(cls, name, attr_string):
"""
Parse the attribute line if it knows how. Returns the parsed
attribute, or None.
For nominal attributes, the attribute string would be like '{<attr_1>,
<attr2>, <attr_3>}'.
"""
if attr_string[0] == '{':
values = cls._get_nom_val(attr_string)
return cls(name, values)
else:
return None
def parse_data(self, data_str):
"""
Parse a value of this type.
"""
if data_str in self.values:
return data_str
elif data_str == '?':
return data_str
else:
raise ValueError(f"{str(data_str)} value not in {str(self.values)}")
def __str__(self):
msg = self.name + ",{"
for i in range(len(self.values)-1):
msg += self.values[i] + ","
msg += self.values[-1]
msg += "}"
return msg
class NumericAttribute(Attribute):
def __init__(self, name):
super().__init__(name)
self.type_name = 'numeric'
self.dtype = np.float64
@classmethod
def parse_attribute(cls, name, attr_string):
"""
Parse the attribute line if it knows how. Returns the parsed
attribute, or None.
For numeric attributes, the attribute string would be like
'numeric' or 'int' or 'real'.
"""
attr_string = attr_string.lower().strip()
if (attr_string[:len('numeric')] == 'numeric' or
attr_string[:len('int')] == 'int' or
attr_string[:len('real')] == 'real'):
return cls(name)
else:
return None
def parse_data(self, data_str):
"""
Parse a value of this type.
Parameters
----------
data_str : str
string to convert
Returns
-------
f : float
where float can be nan
Examples
--------
>>> from scipy.io.arff._arffread import NumericAttribute
>>> atr = NumericAttribute('atr')
>>> atr.parse_data('1')
1.0
>>> atr.parse_data('1\\n')
1.0
>>> atr.parse_data('?\\n')
nan
"""
if '?' in data_str:
return np.nan
else:
return float(data_str)
def _basic_stats(self, data):
nbfac = data.size * 1. / (data.size - 1)
return (np.nanmin(data), np.nanmax(data),
np.mean(data), np.std(data) * nbfac)
class StringAttribute(Attribute):
def __init__(self, name):
super().__init__(name)
self.type_name = 'string'
@classmethod
def parse_attribute(cls, name, attr_string):
"""
Parse the attribute line if it knows how. Returns the parsed
attribute, or None.
For string attributes, the attribute string would be like
'string'.
"""
attr_string = attr_string.lower().strip()
if attr_string[:len('string')] == 'string':
return cls(name)
else:
return None
class DateAttribute(Attribute):
def __init__(self, name, date_format, datetime_unit):
super().__init__(name)
self.date_format = date_format
self.datetime_unit = datetime_unit
self.type_name = 'date'
self.range = date_format
self.dtype = np.datetime64(0, self.datetime_unit)
@staticmethod
def _get_date_format(atrv):
m = r_date.match(atrv)
if m:
pattern = m.group(1).strip()
# convert time pattern from Java's SimpleDateFormat to C's format
datetime_unit = None
if "yyyy" in pattern:
pattern = pattern.replace("yyyy", "%Y")
datetime_unit = "Y"
            elif "yy" in pattern:
pattern = pattern.replace("yy", "%y")
datetime_unit = "Y"
if "MM" in pattern:
pattern = pattern.replace("MM", "%m")
datetime_unit = "M"
if "dd" in pattern:
pattern = pattern.replace("dd", "%d")
datetime_unit = "D"
if "HH" in pattern:
pattern = pattern.replace("HH", "%H")
datetime_unit = "h"
if "mm" in pattern:
pattern = pattern.replace("mm", "%M")
datetime_unit = "m"
if "ss" in pattern:
pattern = pattern.replace("ss", "%S")
datetime_unit = "s"
if "z" in pattern or "Z" in pattern:
raise ValueError("Date type attributes with time zone not "
"supported, yet")
if datetime_unit is None:
raise ValueError("Invalid or unsupported date format")
return pattern, datetime_unit
else:
raise ValueError("Invalid or no date format")
@classmethod
def parse_attribute(cls, name, attr_string):
"""
Parse the attribute line if it knows how. Returns the parsed
attribute, or None.
For date attributes, the attribute string would be like
'date <format>'.
"""
attr_string_lower = attr_string.lower().strip()
if attr_string_lower[:len('date')] == 'date':
date_format, datetime_unit = cls._get_date_format(attr_string)
return cls(name, date_format, datetime_unit)
else:
return None
def parse_data(self, data_str):
"""
Parse a value of this type.
"""
date_str = data_str.strip().strip("'").strip('"')
if date_str == '?':
return np.datetime64('NaT', self.datetime_unit)
else:
dt = datetime.datetime.strptime(date_str, self.date_format)
return np.datetime64(dt).astype(
"datetime64[%s]" % self.datetime_unit)
def __str__(self):
return super().__str__() + ',' + self.date_format
class RelationalAttribute(Attribute):
def __init__(self, name):
super().__init__(name)
self.type_name = 'relational'
self.dtype = np.object_
self.attributes = []
self.dialect = None
@classmethod
def parse_attribute(cls, name, attr_string):
"""
Parse the attribute line if it knows how. Returns the parsed
attribute, or None.
For date attributes, the attribute string would be like
'date <format>'.
"""
attr_string_lower = attr_string.lower().strip()
if attr_string_lower[:len('relational')] == 'relational':
return cls(name)
else:
return None
def parse_data(self, data_str):
        # Same row-parsing logic as the generator in _loadarff.
elems = list(range(len(self.attributes)))
escaped_string = data_str.encode().decode("unicode-escape")
row_tuples = []
for raw in escaped_string.split("\n"):
row, self.dialect = split_data_line(raw, self.dialect)
row_tuples.append(tuple(
[self.attributes[i].parse_data(row[i]) for i in elems]))
return np.array(row_tuples,
[(a.name, a.dtype) for a in self.attributes])
def __str__(self):
return (super().__str__() + '\n\t' +
'\n\t'.join(str(a) for a in self.attributes))
# -----------------
# Various utilities
# -----------------
def to_attribute(name, attr_string):
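    # Try each attribute class in turn; the first successful parse wins.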
attr_classes = (NominalAttribute, NumericAttribute, DateAttribute,
StringAttribute, RelationalAttribute)
for cls in attr_classes:
attr = cls.parse_attribute(name, attr_string)
if attr is not None:
return attr
raise ParseArffError("unknown attribute %s" % attr_string)
def csv_sniffer_has_bug_last_field():
"""
Checks if the bug https://bugs.python.org/issue30157 is unpatched.
"""
# We only compute this once.
has_bug = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
if has_bug is None:
dialect = csv.Sniffer().sniff("3, 'a'")
csv_sniffer_has_bug_last_field.has_bug = dialect.quotechar != "'"
has_bug = csv_sniffer_has_bug_last_field.has_bug
return has_bug
def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
"""
    Workaround for the bug https://bugs.python.org/issue30157 if it is unpatched.
"""
if csv_sniffer_has_bug_last_field():
# Reuses code from the csv module
right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)' # noqa: E501
for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", # noqa: E501
r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # .*?", # noqa: E501
right_regex, # ,".*?"
r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space) # noqa: E501
regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
matches = regexp.findall(sniff_line)
if matches:
break
# If it does not match the expression that was bugged,
# then this bug does not apply
if restr != right_regex:
return
groupindex = regexp.groupindex
# There is only one end of the string
assert len(matches) == 1
m = matches[0]
n = groupindex['quote'] - 1
quote = m[n]
n = groupindex['delim'] - 1
delim = m[n]
n = groupindex['space'] - 1
space = bool(m[n])
dq_regexp = re.compile(
rf"(({re.escape(delim)})|^)\W*{quote}[^{re.escape(delim)}\n]*{quote}[^{re.escape(delim)}\n]*{quote}\W*(({re.escape(delim)})|$)", re.MULTILINE # noqa: E501
)
doublequote = bool(dq_regexp.search(sniff_line))
dialect.quotechar = quote
if delim in delimiters:
dialect.delimiter = delim
dialect.doublequote = doublequote
dialect.skipinitialspace = space
def split_data_line(line, dialect=None):
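    # Detect the CSV dialect once with csv.Sniffer, then reuse it for
    # subsequent lines via the returned dialect.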
delimiters = ",\t"
    # This cannot be done on a per-reader basis, and relational fields
# can be HUGE
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
# Remove the line end if any
if line[-1] == '\n':
line = line[:-1]
# Remove potential trailing whitespace
line = line.strip()
sniff_line = line
# Add a delimiter if none is present, so that the csv.Sniffer
# does not complain for a single-field CSV.
if not any(d in line for d in delimiters):
sniff_line += ","
if dialect is None:
dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
dialect=dialect,
delimiters=delimiters)
row = next(csv.reader([line], dialect))
return row, dialect
# --------------
# Parsing header
# --------------
def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., one starting with @attribute).
Given a raw string attribute, try to get the name and type of the
attribute. Constraints:
    * The first line must start with @attribute (case-insensitive; leading
      whitespace before @attribute is allowed)
    * Also works if the attribute is spread over multiple lines.
    * Works if empty lines or comments appear in between
Parameters
----------
attribute : str
the attribute string.
Returns
-------
name : str
name of the attribute
value : str
value of the attribute
next : str
next line to be parsed
Examples
--------
If attribute is a string defined in python as r"floupi real", will
return floupi as name, and real as value.
>>> from scipy.io.arff._arffread import tokenize_attribute
>>> iterable = iter([0] * 10) # dummy iterator
>>> tokenize_attribute(iterable, r"@attribute floupi real")
('floupi', 'real', 0)
If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
and real as value.
>>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
('floupi 2', 'real', 0)
"""
sattr = attribute.strip()
mattr = r_attribute.match(sattr)
if mattr:
# atrv is everything after @attribute
atrv = mattr.group(1)
if r_comattrval.match(atrv):
name, type = tokenize_single_comma(atrv)
next_item = next(iterable)
elif r_wcomattrval.match(atrv):
name, type = tokenize_single_wcomma(atrv)
next_item = next(iterable)
else:
# Not sure we should support this, as it does not seem supported by
# weka.
raise ValueError("multi line not supported yet")
else:
raise ValueError("First line unparsable: %s" % sattr)
attribute = to_attribute(name, type)
if type.lower() == 'relational':
next_item = read_relational_attribute(iterable, attribute, next_item)
# raise ValueError("relational attributes not supported yet")
return attribute, next_item
def tokenize_single_comma(val):
# XXX we match twice the same string (here and at the caller level). It is
# stupid, but it is easier for now...
m = r_comattrval.match(val)
if m:
try:
name = m.group(1).strip()
type = m.group(2).strip()
except IndexError as e:
raise ValueError("Error while tokenizing attribute") from e
else:
raise ValueError("Error while tokenizing single %s" % val)
return name, type
def tokenize_single_wcomma(val):
# XXX we match twice the same string (here and at the caller level). It is
# stupid, but it is easier for now...
m = r_wcomattrval.match(val)
if m:
try:
name = m.group(1).strip()
type = m.group(2).strip()
except IndexError as e:
raise ValueError("Error while tokenizing attribute") from e
else:
raise ValueError("Error while tokenizing single %s" % val)
return name, type
def read_relational_attribute(ofile, relational_attribute, i):
"""Read the nested attributes of a relational attribute"""
r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
relational_attribute.name + r'\s*$')
while not r_end_relational.match(i):
m = r_headerline.match(i)
if m:
isattr = r_attribute.match(i)
if isattr:
attr, i = tokenize_attribute(ofile, i)
relational_attribute.attributes.append(attr)
else:
raise ValueError("Error parsing line %s" % i)
else:
i = next(ofile)
i = next(ofile)
return i
def read_header(ofile):
"""Read the header of the iterable ofile."""
i = next(ofile)
# Pass first comments
while r_comment.match(i):
i = next(ofile)
    # The header is everything up to the @data line
relation = None
attributes = []
while not r_datameta.match(i):
m = r_headerline.match(i)
if m:
isattr = r_attribute.match(i)
if isattr:
attr, i = tokenize_attribute(ofile, i)
attributes.append(attr)
else:
isrel = r_relation.match(i)
if isrel:
relation = isrel.group(1)
else:
raise ValueError("Error parsing line %s" % i)
i = next(ofile)
else:
i = next(ofile)
return relation, attributes
class MetaData:
"""Small container to keep useful information on a ARFF dataset.
Knows about attributes names and types.
Examples
--------
::
data, meta = loadarff('iris.arff')
# This will print the attributes names of the iris.arff dataset
for i in meta:
print(i)
# This works too
meta.names()
# Getting attribute type
types = meta.types()
Methods
-------
names
types
Notes
-----
Also maintains the list of attributes in order, i.e., doing for i in
meta, where meta is an instance of MetaData, will return the
different attribute names in the order they were defined.
"""
def __init__(self, rel, attr):
self.name = rel
self._attributes = {a.name: a for a in attr}
def __repr__(self):
msg = ""
msg += "Dataset: %s\n" % self.name
for i in self._attributes:
msg += f"\t{i}'s type is {self._attributes[i].type_name}"
if self._attributes[i].range:
msg += ", range is %s" % str(self._attributes[i].range)
msg += '\n'
return msg
def __iter__(self):
return iter(self._attributes)
def __getitem__(self, key):
attr = self._attributes[key]
return (attr.type_name, attr.range)
def names(self):
"""Return the list of attribute names.
Returns
-------
attrnames : list of str
The attribute names.
"""
return list(self._attributes)
def types(self):
"""Return the list of attribute types.
Returns
-------
attr_types : list of str
The attribute types.
"""
attr_types = [self._attributes[name].type_name
for name in self._attributes]
return attr_types
def loadarff(f):
"""
Read an arff file.
The data is returned as a record array, which can be accessed much like
a dictionary of NumPy arrays. For example, if one of the attributes is
called 'pressure', then its first 10 data points can be accessed from the
``data`` record array like so: ``data['pressure'][0:10]``
Parameters
----------
f : file-like or str
File-like object to read from, or filename to open.
Returns
-------
data : record array
The data of the arff file, accessible by attribute names.
meta : `MetaData`
Contains information about the arff file such as name and
type of attributes, the relation (name of the dataset), etc.
Raises
------
ParseArffError
This is raised if the given file is not ARFF-formatted.
NotImplementedError
The ARFF file has an attribute which is not supported yet.
Notes
-----
This function should be able to read most arff files. Functionality
that is not yet implemented includes:
* date type attributes
* string type attributes
It can read files with numeric and nominal attributes. It cannot read
files with sparse data ({} in the file). However, this function can
read files with missing data (? in the file), representing the data
points as NaNs.
Examples
--------
>>> from scipy.io import arff
>>> from io import StringIO
>>> content = \"\"\"
... @relation foo
... @attribute width numeric
... @attribute height numeric
... @attribute color {red,green,blue,yellow,black}
... @data
... 5.0,3.25,blue
... 4.5,3.75,green
... 3.0,4.00,red
... \"\"\"
>>> f = StringIO(content)
>>> data, meta = arff.loadarff(f)
>>> data
array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
>>> meta
Dataset: foo
\twidth's type is numeric
\theight's type is numeric
\tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')
"""
if hasattr(f, 'read'):
ofile = f
else:
ofile = open(f)
try:
return _loadarff(ofile)
finally:
if ofile is not f: # only close what we opened
ofile.close()
def _loadarff(ofile):
# Parse the header file
try:
rel, attr = read_header(ofile)
except ValueError as e:
msg = "Error while parsing header, error was: " + str(e)
raise ParseArffError(msg) from e
# Check whether we have a string attribute (not supported yet)
hasstr = False
for a in attr:
if isinstance(a, StringAttribute):
hasstr = True
meta = MetaData(rel, attr)
# XXX The following code is not great
# Build the type descriptor descr and the list of converters to convert
# each attribute to the suitable type (which should match the one in
# descr).
# This can be used once we want to support integers as integer values and
# not as numeric anymore (using masked arrays?).
if hasstr:
# How to support strings efficiently? Ideally, we should know the maximum
# string size before allocating the NumPy array.
raise NotImplementedError("String attributes not supported yet, sorry")
ni = len(attr)
def generator(row_iter, delim=','):
# TODO: this is where we are spending time (~80%). I think things
# could be made more efficient:
# - We could for example "compile" the function, because some values
# do not change here.
# - The function to convert a line to dtyped values could also be
# generated on the fly from a string and be executed instead of
# looping.
# - The regexes are overkill: for comments, checking that a line starts
# with % should be enough and faster, and the same goes for empty lines
# --> this does not seem to change anything.
# 'compiling' the range since it does not change
# Note, I have already tried zipping the converters and
# row elements and got slightly worse performance.
elems = list(range(ni))
dialect = None
for raw in row_iter:
# We do not abstract skipping comments and empty lines for
# performance reasons.
if r_comment.match(raw) or r_empty.match(raw):
continue
row, dialect = split_data_line(raw, dialect)
yield tuple([attr[i].parse_data(row[i]) for i in elems])
a = list(generator(ofile))
# No error should happen here: it is a bug otherwise
data = np.array(a, [(a.name, a.dtype) for a in attr])
return data, meta
# ----
# Misc
# ----
def basic_stats(data):
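# nbfac is the N/(N-1) small-sample correction factor; note that below it is
# applied directly to the standard deviation, not to the variance.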
nbfac = data.size * 1. / (data.size - 1)
return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
def print_attribute(name, tp, data):
type = tp.type_name
if type == 'numeric' or type == 'real' or type == 'integer':
min, max, mean, std = basic_stats(data)
print(f"{name},{type},{min:f},{max:f},{mean:f},{std:f}")
else:
print(str(tp))
def test_weka(filename):
data, meta = loadarff(filename)
print(len(data.dtype))
print(data.size)
for i in meta:
print_attribute(i, meta[i], data[i])
# make sure nose does not find this as a test
test_weka.__test__ = False
if __name__ == '__main__':
import sys
filename = sys.argv[1]
test_weka(filename)

View File

@ -0,0 +1,19 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.arff` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'MetaData', 'loadarff', 'ArffError', 'ParseArffError',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.arff", module="arffread",
private_modules=["_arffread"], all=__all__,
attribute=name)

View File

@ -0,0 +1,225 @@
% 1. Title: Iris Plants Database
%
% 2. Sources:
% (a) Creator: R.A. Fisher
% (b) Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
% (c) Date: July, 1988
%
% 3. Past Usage:
% - Publications: too many to mention!!! Here are a few.
% 1. Fisher,R.A. "The use of multiple measurements in taxonomic problems"
% Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions
% to Mathematical Statistics" (John Wiley, NY, 1950).
% 2. Duda,R.O., & Hart,P.E. (1973) Pattern Classification and Scene Analysis.
% (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.
% 3. Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
% Structure and Classification Rule for Recognition in Partially Exposed
% Environments". IEEE Transactions on Pattern Analysis and Machine
% Intelligence, Vol. PAMI-2, No. 1, 67-71.
% -- Results:
% -- very low misclassification rates (0% for the setosa class)
% 4. Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE
% Transactions on Information Theory, May 1972, 431-433.
% -- Results:
% -- very low misclassification rates again
% 5. See also: 1988 MLC Proceedings, 54-64. Cheeseman et al's AUTOCLASS II
% conceptual clustering system finds 3 classes in the data.
%
% 4. Relevant Information:
% --- This is perhaps the best known database to be found in the pattern
% recognition literature. Fisher's paper is a classic in the field
% and is referenced frequently to this day. (See Duda & Hart, for
% example.) The data set contains 3 classes of 50 instances each,
% where each class refers to a type of iris plant. One class is
% linearly separable from the other 2; the latter are NOT linearly
% separable from each other.
% --- Predicted attribute: class of iris plant.
% --- This is an exceedingly simple domain.
%
% 5. Number of Instances: 150 (50 in each of three classes)
%
% 6. Number of Attributes: 4 numeric, predictive attributes and the class
%
% 7. Attribute Information:
% 1. sepal length in cm
% 2. sepal width in cm
% 3. petal length in cm
% 4. petal width in cm
% 5. class:
% -- Iris Setosa
% -- Iris Versicolour
% -- Iris Virginica
%
% 8. Missing Attribute Values: None
%
% Summary Statistics:
% Min Max Mean SD Class Correlation
% sepal length: 4.3 7.9 5.84 0.83 0.7826
% sepal width: 2.0 4.4 3.05 0.43 -0.4194
% petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)
% petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)
%
% 9. Class Distribution: 33.3% for each of 3 classes.
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
%
%
%

View File

@ -0,0 +1,8 @@
% This arff file contains some missing data
@relation missing
@attribute yop real
@attribute yap real
@data
1,5
2,4
?,?

View File

@ -0,0 +1,11 @@
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
% This file has no data

View File

@ -0,0 +1,13 @@
% Regression test for issue #10232 : Exception in loadarff with quoted nominal attributes
% Spaces between elements are stripped by the parser
@relation SOME_DATA
@attribute age numeric
@attribute smoker {'yes', 'no'}
@data
18, 'no'
24, 'yes'
44, 'no'
56, 'no'
89,'yes'
11, 'no'

View File

@ -0,0 +1,13 @@
% Regression test for issue #10232 : Exception in loadarff with quoted nominal attributes
% Spaces inside quotes are NOT stripped by the parser
@relation SOME_DATA
@attribute age numeric
@attribute smoker {' yes', 'no '}
@data
18,'no '
24,' yes'
44,'no '
56,'no '
89,' yes'
11,'no '

View File

@ -0,0 +1,10 @@
@RELATION test1
@ATTRIBUTE attr0 REAL
@ATTRIBUTE attr1 REAL
@ATTRIBUTE attr2 REAL
@ATTRIBUTE attr3 REAL
@ATTRIBUTE class {class0, class1, class2, class3}
@DATA
0.1, 0.2, 0.3, 0.4,class1

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,11 @@
@RELATION test11
@ATTRIBUTE attr0 REAL
@ATTRIBUTE attr1 REAL
@ATTRIBUTE attr2 REAL
@ATTRIBUTE attr3 REAL
@ATTRIBUTE class { class0, class1, class2, class3 }
@DATA
0.1, 0.2, 0.3, 0.4,class1
-0.1, -0.2, -0.3, -0.4,class2
1, 2, 3, 4,class3

View File

@ -0,0 +1,15 @@
@RELATION test2
@ATTRIBUTE attr0 REAL
@ATTRIBUTE attr1 real
@ATTRIBUTE attr2 integer
@ATTRIBUTE attr3 Integer
@ATTRIBUTE attr4 Numeric
@ATTRIBUTE attr5 numeric
@ATTRIBUTE attr6 string
@ATTRIBUTE attr7 STRING
@ATTRIBUTE attr8 {bla}
@ATTRIBUTE attr9 {bla, bla}
@DATA
0.1, 0.2, 0.3, 0.4,class1

View File

@ -0,0 +1,6 @@
@RELATION test3
@ATTRIBUTE attr0 crap
@DATA
0.1, 0.2, 0.3, 0.4,class1

View File

@ -0,0 +1,11 @@
@RELATION test5
@ATTRIBUTE attr0 REAL
@ATTRIBUTE attr1 REAL
@ATTRIBUTE attr2 REAL
@ATTRIBUTE attr3 REAL
@ATTRIBUTE class {class0, class1, class2, class3}
@DATA
0.1, 0.2, 0.3, 0.4,class1
-0.1, -0.2, -0.3, -0.4,class2
1, 2, 3, 4,class3

View File

@ -0,0 +1,26 @@
@RELATION test4
@ATTRIBUTE attr0 REAL
@ATTRIBUTE attr1 REAL
@ATTRIBUTE attr2 REAL
@ATTRIBUTE attr3 REAL
@ATTRIBUTE class {class0, class1, class2, class3}
@DATA
% lsdflkjhaksjdhf
% lsdflkjhaksjdhf
0.1, 0.2, 0.3, 0.4,class1
% laksjdhf
% lsdflkjhaksjdhf
-0.1, -0.2, -0.3, -0.4,class2
% lsdflkjhaksjdhf
% lsdflkjhaksjdhf
% lsdflkjhaksjdhf
1, 2, 3, 4,class3

View File

@ -0,0 +1,12 @@
@RELATION test6
@ATTRIBUTE attr0 REAL
@ATTRIBUTE attr1 REAL
@ATTRIBUTE attr2 REAL
@ATTRIBUTE attr3 REAL
@ATTRIBUTE class {C}
@DATA
0.1, 0.2, 0.3, 0.4,C
-0.1, -0.2, -0.3, -0.4,C
1, 2, 3, 4,C

View File

@ -0,0 +1,15 @@
@RELATION test7
@ATTRIBUTE attr_year DATE yyyy
@ATTRIBUTE attr_month DATE yyyy-MM
@ATTRIBUTE attr_date DATE yyyy-MM-dd
@ATTRIBUTE attr_datetime_local DATE "yyyy-MM-dd HH:mm"
@ATTRIBUTE attr_datetime_missing DATE "yyyy-MM-dd HH:mm"
@DATA
1999,1999-01,1999-01-31,"1999-01-31 00:01",?
2004,2004-12,2004-12-01,"2004-12-01 23:59","2004-12-01 23:59"
1817,1817-04,1817-04-28,"1817-04-28 13:00",?
2100,2100-09,2100-09-10,"2100-09-10 12:00",?
2013,2013-11,2013-11-30,"2013-11-30 04:55","2013-11-30 04:55"
1631,1631-10,1631-10-15,"1631-10-15 20:04","1631-10-15 20:04"

View File

@ -0,0 +1,12 @@
@RELATION test8
@ATTRIBUTE attr_datetime_utc DATE "yyyy-MM-dd HH:mm Z"
@ATTRIBUTE attr_datetime_full DATE "yy-MM-dd HH:mm:ss z"
@DATA
"1999-01-31 00:01 UTC","99-01-31 00:01:08 +0430"
"2004-12-01 23:59 UTC","04-12-01 23:59:59 -0800"
"1817-04-28 13:00 UTC","17-04-28 13:00:33 +1000"
"2100-09-10 12:00 UTC","21-09-10 12:00:21 -0300"
"2013-11-30 04:55 UTC","13-11-30 04:55:48 -1100"
"1631-10-15 20:04 UTC","31-10-15 20:04:10 +0000"

View File

@ -0,0 +1,14 @@
@RELATION test9
@ATTRIBUTE attr_date_number RELATIONAL
@ATTRIBUTE attr_date DATE "yyyy-MM-dd"
@ATTRIBUTE attr_number INTEGER
@END attr_date_number
@DATA
"1999-01-31 1\n1935-11-27 10"
"2004-12-01 2\n1942-08-13 20"
"1817-04-28 3"
"2100-09-10 4\n1957-04-17 40\n1721-01-14 400"
"2013-11-30 5"
"1631-10-15 6"

View File

@ -0,0 +1,421 @@
import datetime
import os
import sys
from os.path import join as pjoin
from io import StringIO
import numpy as np
from numpy.testing import (assert_array_almost_equal,
assert_array_equal, assert_equal, assert_)
from pytest import raises as assert_raises
from scipy.io.arff import loadarff
from scipy.io.arff._arffread import read_header, ParseArffError
data_path = pjoin(os.path.dirname(__file__), 'data')
test1 = pjoin(data_path, 'test1.arff')
test2 = pjoin(data_path, 'test2.arff')
test3 = pjoin(data_path, 'test3.arff')
test4 = pjoin(data_path, 'test4.arff')
test5 = pjoin(data_path, 'test5.arff')
test6 = pjoin(data_path, 'test6.arff')
test7 = pjoin(data_path, 'test7.arff')
test8 = pjoin(data_path, 'test8.arff')
test9 = pjoin(data_path, 'test9.arff')
test10 = pjoin(data_path, 'test10.arff')
test11 = pjoin(data_path, 'test11.arff')
test_quoted_nominal = pjoin(data_path, 'quoted_nominal.arff')
test_quoted_nominal_spaces = pjoin(data_path, 'quoted_nominal_spaces.arff')
expect4_data = [(0.1, 0.2, 0.3, 0.4, 'class1'),
(-0.1, -0.2, -0.3, -0.4, 'class2'),
(1, 2, 3, 4, 'class3')]
expected_types = ['numeric', 'numeric', 'numeric', 'numeric', 'nominal']
missing = pjoin(data_path, 'missing.arff')
expect_missing_raw = np.array([[1, 5], [2, 4], [np.nan, np.nan]])
expect_missing = np.empty(3, [('yop', float), ('yap', float)])
expect_missing['yop'] = expect_missing_raw[:, 0]
expect_missing['yap'] = expect_missing_raw[:, 1]
class TestData:
def test1(self):
# Parsing trivial file with some comments in the data section.
self._test(test4)
def test2(self):
# Parsing trivial file with nothing special.
self._test(test5)
def test3(self):
# Parsing trivial file with nominal attribute of 1 character.
self._test(test6)
def test4(self):
# Parsing trivial file with extra spaces in the nominal attribute declaration.
self._test(test11)
def _test(self, test_file):
data, meta = loadarff(test_file)
for i in range(len(data)):
for j in range(4):
assert_array_almost_equal(expect4_data[i][j], data[i][j])
assert_equal(meta.types(), expected_types)
def test_filelike(self):
# Test reading from file-like object (StringIO)
with open(test1) as f1:
data1, meta1 = loadarff(f1)
with open(test1) as f2:
data2, meta2 = loadarff(StringIO(f2.read()))
assert_(data1 == data2)
assert_(repr(meta1) == repr(meta2))
def test_path(self):
# Test reading from `pathlib.Path` object
from pathlib import Path
with open(test1) as f1:
data1, meta1 = loadarff(f1)
data2, meta2 = loadarff(Path(test1))
assert_(data1 == data2)
assert_(repr(meta1) == repr(meta2))
class TestMissingData:
def test_missing(self):
data, meta = loadarff(missing)
for i in ['yop', 'yap']:
assert_array_almost_equal(data[i], expect_missing[i])
class TestNoData:
def test_nodata(self):
# The file nodata.arff has no data in the @DATA section.
# Reading it should result in an array with length 0.
nodata_filename = os.path.join(data_path, 'nodata.arff')
data, meta = loadarff(nodata_filename)
if sys.byteorder == 'big':
end = '>'
else:
end = '<'
expected_dtype = np.dtype([('sepallength', f'{end}f8'),
('sepalwidth', f'{end}f8'),
('petallength', f'{end}f8'),
('petalwidth', f'{end}f8'),
('class', 'S15')])
assert_equal(data.dtype, expected_dtype)
assert_equal(data.size, 0)
class TestHeader:
def test_type_parsing(self):
# Test parsing type of attribute from their value.
with open(test2) as ofile:
rel, attrs = read_header(ofile)
expected = ['numeric', 'numeric', 'numeric', 'numeric', 'numeric',
'numeric', 'string', 'string', 'nominal', 'nominal']
for i in range(len(attrs)):
assert_(attrs[i].type_name == expected[i])
def test_badtype_parsing(self):
# Test parsing wrong type of attribute from their value.
def badtype_read():
with open(test3) as ofile:
_, _ = read_header(ofile)
assert_raises(ParseArffError, badtype_read)
def test_fullheader1(self):
# Parsing trivial header with nothing.
with open(test1) as ofile:
rel, attrs = read_header(ofile)
# Test relation
assert_(rel == 'test1')
# Test numerical attributes
assert_(len(attrs) == 5)
for i in range(4):
assert_(attrs[i].name == 'attr%d' % i)
assert_(attrs[i].type_name == 'numeric')
# Test nominal attribute
assert_(attrs[4].name == 'class')
assert_(attrs[4].values == ('class0', 'class1', 'class2', 'class3'))
def test_dateheader(self):
with open(test7) as ofile:
rel, attrs = read_header(ofile)
assert_(rel == 'test7')
assert_(len(attrs) == 5)
assert_(attrs[0].name == 'attr_year')
assert_(attrs[0].date_format == '%Y')
assert_(attrs[1].name == 'attr_month')
assert_(attrs[1].date_format == '%Y-%m')
assert_(attrs[2].name == 'attr_date')
assert_(attrs[2].date_format == '%Y-%m-%d')
assert_(attrs[3].name == 'attr_datetime_local')
assert_(attrs[3].date_format == '%Y-%m-%d %H:%M')
assert_(attrs[4].name == 'attr_datetime_missing')
assert_(attrs[4].date_format == '%Y-%m-%d %H:%M')
def test_dateheader_unsupported(self):
def read_dateheader_unsupported():
with open(test8) as ofile:
_, _ = read_header(ofile)
assert_raises(ValueError, read_dateheader_unsupported)
class TestDateAttribute:
def setup_method(self):
self.data, self.meta = loadarff(test7)
def test_year_attribute(self):
expected = np.array([
'1999',
'2004',
'1817',
'2100',
'2013',
'1631'
], dtype='datetime64[Y]')
assert_array_equal(self.data["attr_year"], expected)
def test_month_attribute(self):
expected = np.array([
'1999-01',
'2004-12',
'1817-04',
'2100-09',
'2013-11',
'1631-10'
], dtype='datetime64[M]')
assert_array_equal(self.data["attr_month"], expected)
def test_date_attribute(self):
expected = np.array([
'1999-01-31',
'2004-12-01',
'1817-04-28',
'2100-09-10',
'2013-11-30',
'1631-10-15'
], dtype='datetime64[D]')
assert_array_equal(self.data["attr_date"], expected)
def test_datetime_local_attribute(self):
expected = np.array([
datetime.datetime(year=1999, month=1, day=31, hour=0, minute=1),
datetime.datetime(year=2004, month=12, day=1, hour=23, minute=59),
datetime.datetime(year=1817, month=4, day=28, hour=13, minute=0),
datetime.datetime(year=2100, month=9, day=10, hour=12, minute=0),
datetime.datetime(year=2013, month=11, day=30, hour=4, minute=55),
datetime.datetime(year=1631, month=10, day=15, hour=20, minute=4)
], dtype='datetime64[m]')
assert_array_equal(self.data["attr_datetime_local"], expected)
def test_datetime_missing(self):
expected = np.array([
'nat',
'2004-12-01T23:59',
'nat',
'nat',
'2013-11-30T04:55',
'1631-10-15T20:04'
], dtype='datetime64[m]')
assert_array_equal(self.data["attr_datetime_missing"], expected)
def test_datetime_timezone(self):
assert_raises(ParseArffError, loadarff, test8)
class TestRelationalAttribute:
def setup_method(self):
self.data, self.meta = loadarff(test9)
def test_attributes(self):
assert_equal(len(self.meta._attributes), 1)
relational = list(self.meta._attributes.values())[0]
assert_equal(relational.name, 'attr_date_number')
assert_equal(relational.type_name, 'relational')
assert_equal(len(relational.attributes), 2)
assert_equal(relational.attributes[0].name,
'attr_date')
assert_equal(relational.attributes[0].type_name,
'date')
assert_equal(relational.attributes[1].name,
'attr_number')
assert_equal(relational.attributes[1].type_name,
'numeric')
def test_data(self):
dtype_instance = [('attr_date', 'datetime64[D]'),
('attr_number', np.float64)]
expected = [
np.array([('1999-01-31', 1), ('1935-11-27', 10)],
dtype=dtype_instance),
np.array([('2004-12-01', 2), ('1942-08-13', 20)],
dtype=dtype_instance),
np.array([('1817-04-28', 3)],
dtype=dtype_instance),
np.array([('2100-09-10', 4), ('1957-04-17', 40),
('1721-01-14', 400)],
dtype=dtype_instance),
np.array([('2013-11-30', 5)],
dtype=dtype_instance),
np.array([('1631-10-15', 6)],
dtype=dtype_instance)
]
for i in range(len(self.data["attr_date_number"])):
assert_array_equal(self.data["attr_date_number"][i],
expected[i])
class TestRelationalAttributeLong:
def setup_method(self):
self.data, self.meta = loadarff(test10)
def test_attributes(self):
assert_equal(len(self.meta._attributes), 1)
relational = list(self.meta._attributes.values())[0]
assert_equal(relational.name, 'attr_relational')
assert_equal(relational.type_name, 'relational')
assert_equal(len(relational.attributes), 1)
assert_equal(relational.attributes[0].name,
'attr_number')
assert_equal(relational.attributes[0].type_name, 'numeric')
def test_data(self):
dtype_instance = [('attr_number', np.float64)]
expected = np.array([(n,) for n in range(30000)],
dtype=dtype_instance)
assert_array_equal(self.data["attr_relational"][0],
expected)
class TestQuotedNominal:
"""
Regression test for issue #10232:
Exception in loadarff with quoted nominal attributes.
"""
def setup_method(self):
self.data, self.meta = loadarff(test_quoted_nominal)
def test_attributes(self):
assert_equal(len(self.meta._attributes), 2)
age, smoker = self.meta._attributes.values()
assert_equal(age.name, 'age')
assert_equal(age.type_name, 'numeric')
assert_equal(smoker.name, 'smoker')
assert_equal(smoker.type_name, 'nominal')
assert_equal(smoker.values, ['yes', 'no'])
def test_data(self):
age_dtype_instance = np.float64
smoker_dtype_instance = '<S3'
age_expected = np.array([
18,
24,
44,
56,
89,
11,
], dtype=age_dtype_instance)
smoker_expected = np.array([
'no',
'yes',
'no',
'no',
'yes',
'no',
], dtype=smoker_dtype_instance)
assert_array_equal(self.data["age"], age_expected)
assert_array_equal(self.data["smoker"], smoker_expected)
class TestQuotedNominalSpaces:
"""
Regression test for issue #10232:
Exception in loadarff with quoted nominal attributes.
"""
def setup_method(self):
self.data, self.meta = loadarff(test_quoted_nominal_spaces)
def test_attributes(self):
assert_equal(len(self.meta._attributes), 2)
age, smoker = self.meta._attributes.values()
assert_equal(age.name, 'age')
assert_equal(age.type_name, 'numeric')
assert_equal(smoker.name, 'smoker')
assert_equal(smoker.type_name, 'nominal')
assert_equal(smoker.values, [' yes', 'no '])
def test_data(self):
age_dtype_instance = np.float64
smoker_dtype_instance = '<S5'
age_expected = np.array([
18,
24,
44,
56,
89,
11,
], dtype=age_dtype_instance)
smoker_expected = np.array([
'no ',
' yes',
'no ',
'no ',
' yes',
'no ',
], dtype=smoker_dtype_instance)
assert_array_equal(self.data["age"], age_expected)
assert_array_equal(self.data["smoker"], smoker_expected)

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = ["hb_read", "hb_write"] # noqa: F822
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io", module="harwell_boeing",
private_modules=["_harwell_boeing"], all=__all__,
attribute=name)

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = ["readsav"] # noqa: F822
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io", module="idl",
private_modules=["_idl"], all=__all__,
attribute=name)

View File

@ -0,0 +1,63 @@
"""
MATLAB® file utilities (:mod:`scipy.io.matlab`)
===============================================
.. currentmodule:: scipy.io.matlab
This submodule is meant to provide lower-level file utilities related to reading
and writing MATLAB files.
.. autosummary::
:toctree: generated/
matfile_version - Get the MATLAB file version
MatReadError - Exception indicating a read issue
MatReadWarning - Warning class for read issues
MatWriteError - Exception indicating a write issue
mat_struct - Class used when ``struct_as_record=False``
.. autosummary::
:toctree: generated/
:template: autosummary/ndarray_subclass.rst
:nosignatures:
MatlabObject - Class for a MATLAB object
MatlabOpaque - Class for a MATLAB opaque matrix
MatlabFunction - Class for a MATLAB function object
The following utilities that live in the :mod:`scipy.io`
namespace also exist in this namespace:
.. autosummary::
:toctree: generated/
loadmat - Read a MATLAB style mat file (version 4 through 7.1)
savemat - Write a MATLAB style mat file (version 4 through 7.1)
whosmat - List contents of a MATLAB style mat file (version 4 through 7.1)
Notes
-----
MATLAB(R) is a registered trademark of The MathWorks, Inc., 3 Apple Hill
Drive, Natick, MA 01760-2098, USA.
"""
# Matlab file read and write utilities
from ._mio import loadmat, savemat, whosmat
from ._mio5 import MatlabFunction
from ._mio5_params import MatlabObject, MatlabOpaque, mat_struct
from ._miobase import (matfile_version, MatReadError, MatReadWarning,
MatWriteError)
# Deprecated namespaces, to be removed in v2.0.0
from . import (mio, mio5, mio5_params, mio4, byteordercodes,
miobase, mio_utils, streams, mio5_utils)
__all__ = [
'loadmat', 'savemat', 'whosmat', 'MatlabObject',
'matfile_version', 'MatReadError', 'MatReadWarning',
'MatWriteError', 'mat_struct', 'MatlabOpaque', 'MatlabFunction'
]
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester

View File

@ -0,0 +1,75 @@
''' Byteorder utilities for system - numpy byteorder encoding
Converts a variety of string codes for little endian, big endian,
native byte order and swapped byte order to explicit NumPy endian
codes - one of '<' (little endian) or '>' (big endian)
'''
import sys
__all__ = [
'aliases', 'native_code', 'swapped_code',
'sys_is_le', 'to_numpy_code'
]
sys_is_le = sys.byteorder == 'little'
native_code = '<' if sys_is_le else '>'
swapped_code = '>' if sys_is_le else '<'
aliases = {'little': ('little', '<', 'l', 'le'),
'big': ('big', '>', 'b', 'be'),
'native': ('native', '='),
'swapped': ('swapped', 's')}  # lowercase 's': codes are lower-cased before lookup
def to_numpy_code(code):
"""
Convert various order codings to NumPy format.
Parameters
----------
code : str
The code to convert. It is converted to lower case before parsing.
Legal values are:
'little', 'big', 'l', 'b', 'le', 'be', '<', '>', 'native', '=',
'swapped', 's'.
Returns
-------
out_code : {'<', '>'}
Here '<' is the numpy dtype code for little endian,
and '>' is the code for big endian.
Examples
--------
>>> import sys
>>> from scipy.io.matlab._byteordercodes import to_numpy_code
>>> sys_is_le = (sys.byteorder == 'little')
>>> sys_is_le
True
>>> to_numpy_code('big')
'>'
>>> to_numpy_code('little')
'<'
>>> nc = to_numpy_code('native')
>>> nc == '<' if sys_is_le else nc == '>'
True
>>> sc = to_numpy_code('swapped')
>>> sc == '>' if sys_is_le else sc == '<'
True
"""
if code is None:
return native_code
code = code.lower()
if code in aliases['little']:
return '<'
elif code in aliases['big']:
return '>'
elif code in aliases['native']:
return native_code
elif code in aliases['swapped']:
return swapped_code
else:
raise ValueError(
'We cannot handle byte order %s' % code)

View File

@ -0,0 +1,359 @@
"""
Module for reading and writing matlab (TM) .mat files
"""
# Authors: Travis Oliphant, Matthew Brett
from contextlib import contextmanager
from ._miobase import _get_matfile_version, docfiller
from ._mio4 import MatFile4Reader, MatFile4Writer
from ._mio5 import MatFile5Reader, MatFile5Writer
__all__ = ['loadmat', 'savemat', 'whosmat']
@contextmanager
def _open_file_context(file_like, appendmat, mode='rb'):
f, opened = _open_file(file_like, appendmat, mode)
try:
yield f
finally:
if opened:
f.close()
def _open_file(file_like, appendmat, mode='rb'):
"""
Open `file_like` and return as file-like object. First, check if object is
already file-like; if so, return it as-is. Otherwise, try to pass it
to open(). If that fails, and `file_like` is a string, and `appendmat` is true,
append '.mat' and try again.
"""
reqs = {'read'} if set(mode) & set('r+') else set()
if set(mode) & set('wax+'):
reqs.add('write')
if reqs.issubset(dir(file_like)):
return file_like, False
try:
return open(file_like, mode), True
except OSError as e:
# Probably "not found"
if isinstance(file_like, str):
if appendmat and not file_like.endswith('.mat'):
file_like += '.mat'
return open(file_like, mode), True
else:
raise OSError(
'Reader needs file name or open file-like object'
) from e
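# Example (sketch): with ``appendmat=True``, ``_open_file('data', True)`` first
# tries ``open('data', 'rb')`` and, if that raises OSError, retries with
# ``open('data.mat', 'rb')``.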
@docfiller
def mat_reader_factory(file_name, appendmat=True, **kwargs):
"""
Create reader for matlab .mat format files.
Parameters
----------
%(file_arg)s
%(append_arg)s
%(load_args)s
%(struct_arg)s
Returns
-------
matreader : MatFileReader object
Initialized instance of MatFileReader class matching the mat file
type detected in `filename`.
file_opened : bool
Whether the file was opened by this routine.
"""
byte_stream, file_opened = _open_file(file_name, appendmat)
mjv, mnv = _get_matfile_version(byte_stream)
if mjv == 0:
return MatFile4Reader(byte_stream, **kwargs), file_opened
elif mjv == 1:
return MatFile5Reader(byte_stream, **kwargs), file_opened
elif mjv == 2:
raise NotImplementedError('Please use HDF reader for matlab v7.3 '
'files, e.g. h5py')
else:
raise TypeError('Did not recognize version %s' % mjv)
@docfiller
def loadmat(file_name, mdict=None, appendmat=True, **kwargs):
"""
Load MATLAB file.
Parameters
----------
file_name : str
Name of the mat file (the .mat extension is not needed if
appendmat==True). Can also pass an open file-like object.
mdict : dict, optional
Dictionary in which to insert matfile variables.
appendmat : bool, optional
True to append the .mat extension to the end of the given
filename, if not already present. Default is True.
byte_order : str or None, optional
None by default, implying byte order guessed from mat
file. Otherwise can be one of ('native', '=', 'little', '<',
'BIG', '>').
mat_dtype : bool, optional
If True, return arrays in same dtype as would be loaded into
MATLAB (instead of the dtype with which they are saved).
squeeze_me : bool, optional
Whether to squeeze unit matrix dimensions or not.
chars_as_strings : bool, optional
Whether to convert char arrays to string arrays.
matlab_compatible : bool, optional
Returns matrices as would be loaded by MATLAB (implies
squeeze_me=False, chars_as_strings=False, mat_dtype=True,
struct_as_record=True).
struct_as_record : bool, optional
Whether to load MATLAB structs as NumPy record arrays, or as
old-style NumPy arrays with dtype=object. Setting this flag to
False replicates the behavior of scipy version 0.7.x (returning
NumPy object arrays). The default setting is True, because it
allows easier round-trip load and save of MATLAB files.
verify_compressed_data_integrity : bool, optional
Whether the length of compressed sequences in the MATLAB file
should be checked, to ensure that they are not longer than we expect.
It is advisable to enable this (the default) because overlong
compressed sequences in MATLAB files generally indicate that the
files have experienced some sort of corruption.
variable_names : None or sequence
If None (the default) - read all variables in file. Otherwise,
`variable_names` should be a sequence of strings, giving names of the
MATLAB variables to read from the file. The reader will skip any
variable with a name not in this sequence, possibly saving some read
processing.
simplify_cells : False, optional
If True, return a simplified dict structure (which is useful if the mat
file contains cell arrays). Note that this only affects the structure
of the result and not its contents (which is identical for both output
structures). If True, this automatically sets `struct_as_record` to
False and `squeeze_me` to True, which is required to simplify cells.
Returns
-------
mat_dict : dict
dictionary with variable names as keys, and loaded matrices as
values.
Notes
-----
v4 (Level 1.0), v6 and v7 to 7.2 matfiles are supported.
You will need an HDF5 Python library to read MATLAB 7.3 format mat
files. Because SciPy does not supply one, we do not implement the
HDF5 / 7.3 interface here.
Examples
--------
>>> from os.path import dirname, join as pjoin
>>> import scipy.io as sio
Get the filename for an example .mat file from the tests/data directory.
>>> data_dir = pjoin(dirname(sio.__file__), 'matlab', 'tests', 'data')
>>> mat_fname = pjoin(data_dir, 'testdouble_7.4_GLNX86.mat')
Load the .mat file contents.
>>> mat_contents = sio.loadmat(mat_fname)
The result is a dictionary, one key/value pair for each variable:
>>> sorted(mat_contents.keys())
['__globals__', '__header__', '__version__', 'testdouble']
>>> mat_contents['testdouble']
array([[0. , 0.78539816, 1.57079633, 2.35619449, 3.14159265,
3.92699082, 4.71238898, 5.49778714, 6.28318531]])
By default SciPy reads MATLAB structs as structured NumPy arrays where the
dtype fields are of type `object` and the names correspond to the MATLAB
struct field names. This can be disabled by setting the optional argument
`struct_as_record=False`.
Get the filename for an example .mat file that contains a MATLAB struct
called `teststruct` and load the contents.
>>> matstruct_fname = pjoin(data_dir, 'teststruct_7.4_GLNX86.mat')
>>> matstruct_contents = sio.loadmat(matstruct_fname)
>>> teststruct = matstruct_contents['teststruct']
>>> teststruct.dtype
dtype([('stringfield', 'O'), ('doublefield', 'O'), ('complexfield', 'O')])
The size of the structured array is the size of the MATLAB struct, not the
number of elements in any particular field. The shape defaults to 2-D
unless the optional argument `squeeze_me=True`, in which case all length 1
dimensions are removed.
>>> teststruct.size
1
>>> teststruct.shape
(1, 1)
Get the 'stringfield' of the first element in the MATLAB struct.
>>> teststruct[0, 0]['stringfield']
array(['Rats live on no evil star.'],
dtype='<U26')
Get the first element of the 'doublefield'.
>>> teststruct['doublefield'][0, 0]
array([[ 1.41421356, 2.71828183, 3.14159265]])
Load the MATLAB struct, squeezing out length 1 dimensions, and get the item
from the 'complexfield'.
>>> matstruct_squeezed = sio.loadmat(matstruct_fname, squeeze_me=True)
>>> matstruct_squeezed['teststruct'].shape
()
>>> matstruct_squeezed['teststruct']['complexfield'].shape
()
>>> matstruct_squeezed['teststruct']['complexfield'].item()
array([ 1.41421356+1.41421356j, 2.71828183+2.71828183j,
3.14159265+3.14159265j])
"""
variable_names = kwargs.pop('variable_names', None)
with _open_file_context(file_name, appendmat) as f:
MR, _ = mat_reader_factory(f, **kwargs)
matfile_dict = MR.get_variables(variable_names)
if mdict is not None:
mdict.update(matfile_dict)
else:
mdict = matfile_dict
return mdict
@docfiller
def savemat(file_name, mdict,
appendmat=True,
format='5',
long_field_names=False,
do_compression=False,
oned_as='row'):
"""
Save a dictionary of names and arrays into a MATLAB-style .mat file.
This saves the array objects in the given dictionary to a MATLAB-
style .mat file.
Parameters
----------
file_name : str or file-like object
Name of the .mat file (.mat extension not needed if ``appendmat ==
True``).
Can also pass open file_like object.
mdict : dict
Dictionary from which to save matfile variables.
appendmat : bool, optional
True (the default) to append the .mat extension to the end of the
given filename, if not already present.
format : {'5', '4'}, string, optional
'5' (the default) for MATLAB 5 and up (to 7.2),
'4' for MATLAB 4 .mat files.
long_field_names : bool, optional
False (the default) - maximum field name length in a structure is
31 characters which is the documented maximum length.
True - maximum field name length in a structure is 63 characters
which works for MATLAB 7.6+.
do_compression : bool, optional
Whether or not to compress matrices on write. Default is False.
oned_as : {'row', 'column'}, optional
If 'column', write 1-D NumPy arrays as column vectors.
If 'row', write 1-D NumPy arrays as row vectors.
Examples
--------
>>> from scipy.io import savemat
>>> import numpy as np
>>> a = np.arange(20)
>>> mdic = {"a": a, "label": "experiment"}
>>> mdic
{'a': array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19]),
'label': 'experiment'}
>>> savemat("matlab_matrix.mat", mdic)
"""
with _open_file_context(file_name, appendmat, 'wb') as file_stream:
if format == '4':
if long_field_names:
message = "Long field names are not available for version 4 files"
raise ValueError(message)
MW = MatFile4Writer(file_stream, oned_as)
elif format == '5':
MW = MatFile5Writer(file_stream,
do_compression=do_compression,
unicode_strings=True,
long_field_names=long_field_names,
oned_as=oned_as)
else:
raise ValueError("Format should be '4' or '5'")
MW.put_variables(mdict)
@docfiller
def whosmat(file_name, appendmat=True, **kwargs):
"""
List variables inside a MATLAB file.
Parameters
----------
%(file_arg)s
%(append_arg)s
%(load_args)s
%(struct_arg)s
Returns
-------
variables : list of tuples
A list of tuples, where each tuple holds the matrix name (a string),
its shape (tuple of ints), and its data class (a string).
Possible data classes are: int8, uint8, int16, uint16, int32, uint32,
int64, uint64, single, double, cell, struct, object, char, sparse,
function, opaque, logical, unknown.
Notes
-----
v4 (Level 1.0), v6 and v7 to 7.2 matfiles are supported.
You will need an HDF5 Python library to read MATLAB 7.3 format mat
files (e.g. h5py). Because SciPy does not supply one, we do not implement the
HDF5 / 7.3 interface here.
.. versionadded:: 0.12.0
Examples
--------
>>> from io import BytesIO
>>> import numpy as np
>>> from scipy.io import savemat, whosmat
Create some arrays, and use `savemat` to write them to a ``BytesIO``
instance.
>>> a = np.array([[10, 20, 30], [11, 21, 31]], dtype=np.int32)
>>> b = np.geomspace(1, 10, 5)
>>> f = BytesIO()
>>> savemat(f, {'a': a, 'b': b})
Use `whosmat` to inspect ``f``. Each tuple in the output list gives
the name, shape and data type of the array in ``f``.
>>> whosmat(f)
[('a', (2, 3), 'int32'), ('b', (1, 5), 'double')]
"""
with _open_file_context(file_name, appendmat) as f:
ML, file_opened = mat_reader_factory(f, **kwargs)
variables = ML.list_variables()
return variables

View File

@ -0,0 +1,625 @@
''' Classes for read / write of matlab (TM) 4 files
'''
import sys
import warnings
import math
import numpy as np
import scipy.sparse
from ._miobase import (MatFileReader, docfiller, matdims, read_dtype,
convert_dtypes, arr_to_chars, arr_dtype_number)
from ._mio_utils import squeeze_element, chars_to_strings
from functools import reduce
__all__ = [
'MatFile4Reader', 'MatFile4Writer', 'SYS_LITTLE_ENDIAN',
'VarHeader4', 'VarReader4', 'VarWriter4', 'arr_to_2d', 'mclass_info',
'mdtypes_template', 'miDOUBLE', 'miINT16', 'miINT32', 'miSINGLE',
'miUINT16', 'miUINT8', 'mxCHAR_CLASS', 'mxFULL_CLASS', 'mxSPARSE_CLASS',
'np_to_mtypes', 'order_codes'
]
SYS_LITTLE_ENDIAN = sys.byteorder == 'little'
miDOUBLE = 0
miSINGLE = 1
miINT32 = 2
miINT16 = 3
miUINT16 = 4
miUINT8 = 5
mdtypes_template = {
miDOUBLE: 'f8',
miSINGLE: 'f4',
miINT32: 'i4',
miINT16: 'i2',
miUINT16: 'u2',
miUINT8: 'u1',
'header': [('mopt', 'i4'),
('mrows', 'i4'),
('ncols', 'i4'),
('imagf', 'i4'),
('namlen', 'i4')],
'U1': 'U1',
}
np_to_mtypes = {
'f8': miDOUBLE,
'c32': miDOUBLE,
'c24': miDOUBLE,
'c16': miDOUBLE,
'f4': miSINGLE,
'c8': miSINGLE,
'i4': miINT32,
'i2': miINT16,
'u2': miUINT16,
'u1': miUINT8,
'S1': miUINT8,
}
# matrix classes
mxFULL_CLASS = 0
mxCHAR_CLASS = 1
mxSPARSE_CLASS = 2
order_codes = {
0: '<',
1: '>',
2: 'VAX D-float', # !
3: 'VAX G-float',
4: 'Cray', # !!
}
mclass_info = {
mxFULL_CLASS: 'double',
mxCHAR_CLASS: 'char',
mxSPARSE_CLASS: 'sparse',
}
class VarHeader4:
# Mat4 variables are never logical or global
is_logical = False
is_global = False
def __init__(self,
name,
dtype,
mclass,
dims,
is_complex):
self.name = name
self.dtype = dtype
self.mclass = mclass
self.dims = dims
self.is_complex = is_complex
class VarReader4:
''' Class to read matlab 4 variables '''
def __init__(self, file_reader):
self.file_reader = file_reader
self.mat_stream = file_reader.mat_stream
self.dtypes = file_reader.dtypes
self.chars_as_strings = file_reader.chars_as_strings
self.squeeze_me = file_reader.squeeze_me
def read_header(self):
''' Read and return header for variable '''
data = read_dtype(self.mat_stream, self.dtypes['header'])
name = self.mat_stream.read(int(data['namlen'])).strip(b'\x00')
if data['mopt'] < 0 or data['mopt'] > 5000:
raise ValueError('Mat 4 mopt wrong format, byteswapping problem?')
M, rest = divmod(data['mopt'], 1000) # order code
if M not in (0, 1):
warnings.warn("We do not support byte ordering '%s'; returned "
"data may be corrupt" % order_codes[M],
UserWarning, stacklevel=3)
O, rest = divmod(rest, 100) # unused, should be 0
if O != 0:
raise ValueError('O in MOPT integer should be 0, wrong format?')
P, rest = divmod(rest, 10) # data type code e.g miDOUBLE (see above)
T = rest # matrix type code e.g., mxFULL_CLASS (see above)
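# e.g., mopt == 10 decodes as M=0 (little endian), O=0, P=1 (miSINGLE),
# T=0 (mxFULL_CLASS); mopt == 1002 as M=1 (big endian), O=0, P=0 (miDOUBLE),
# T=2 (mxSPARSE_CLASS)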
dims = (data['mrows'], data['ncols'])
is_complex = data['imagf'] == 1
dtype = self.dtypes[P]
return VarHeader4(
name,
dtype,
T,
dims,
is_complex)
def array_from_header(self, hdr, process=True):
mclass = hdr.mclass
if mclass == mxFULL_CLASS:
arr = self.read_full_array(hdr)
elif mclass == mxCHAR_CLASS:
arr = self.read_char_array(hdr)
if process and self.chars_as_strings:
arr = chars_to_strings(arr)
elif mclass == mxSPARSE_CLASS:
# no current processing (below) makes sense for sparse
return self.read_sparse_array(hdr)
else:
raise TypeError('No reader for class code %s' % mclass)
if process and self.squeeze_me:
return squeeze_element(arr)
return arr
def read_sub_array(self, hdr, copy=True):
''' Mat4 read using header `hdr` dtype and dims
Parameters
----------
hdr : object
object with attributes ``dtype``, ``dims``. dtype is assumed to be
the correct endianness
copy : bool, optional
copies array before return if True (default True)
(buffer is usually read only)
Returns
-------
arr : ndarray
of dtype given by `hdr` ``dtype`` and shape given by `hdr` ``dims``
'''
dt = hdr.dtype
dims = hdr.dims
num_bytes = dt.itemsize
for d in dims:
num_bytes *= d
buffer = self.mat_stream.read(int(num_bytes))
if len(buffer) != num_bytes:
raise ValueError("Not enough bytes to read matrix '%s'; is this "
"a badly-formed file? Consider listing matrices "
"with `whosmat` and loading named matrices with "
"`variable_names` kwarg to `loadmat`" % hdr.name)
arr = np.ndarray(shape=dims,
dtype=dt,
buffer=buffer,
order='F')
if copy:
arr = arr.copy()
return arr
def read_full_array(self, hdr):
''' Full (rather than sparse) matrix getter
Read matrix (array) can be real or complex
Parameters
----------
hdr : ``VarHeader4`` instance
Returns
-------
arr : ndarray
complex array if ``hdr.is_complex`` is True, otherwise a real
numeric array
'''
if hdr.is_complex:
# avoid array copy to save memory
res = self.read_sub_array(hdr, copy=False)
res_j = self.read_sub_array(hdr, copy=False)
return res + (res_j * 1j)
return self.read_sub_array(hdr)
def read_char_array(self, hdr):
''' latin-1 text matrix (char matrix) reader
Parameters
----------
hdr : ``VarHeader4`` instance
Returns
-------
arr : ndarray
with dtype 'U1', shape given by `hdr` ``dims``
'''
arr = self.read_sub_array(hdr).astype(np.uint8)
S = arr.tobytes().decode('latin-1')
return np.ndarray(shape=hdr.dims,
dtype=np.dtype('U1'),
buffer=np.array(S)).copy()
def read_sparse_array(self, hdr):
''' Read and return sparse matrix type
Parameters
----------
hdr : ``VarHeader4`` instance
Returns
-------
arr : ``scipy.sparse.coo_matrix``
with dtype ``float`` and shape read from the sparse matrix data
Notes
-----
MATLAB 4 real sparse arrays are saved in an N+1 by 3 array format, where
N is the number of non-zero values. Column 1 values [0:N] are the
(1-based) row indices of each non-zero value, column 2 [0:N] are the
column indices, and column 3 [0:N] are the (real) values. The last
entries [-1, 0:2] of the row and column index columns are shape[0] and
shape[1] of the output matrix, respectively. The last value of the values column
is a padding 0. mrows and ncols values from the header give the shape of
the stored matrix, here [N+1, 3]. Complex data are saved as a 4 column
matrix, where the fourth column contains the imaginary component; the
last value is again 0. Complex sparse data do *not* have the header
``imagf`` field set to True; the fact that the data are complex is only
detectable because there are 4 storage columns.
'''
res = self.read_sub_array(hdr)
tmp = res[:-1,:]
# All numbers are float64 in Matlab, but SciPy sparse expects int shape
dims = (int(res[-1,0]), int(res[-1,1]))
I = np.ascontiguousarray(tmp[:,0],dtype='intc') # fixes byte order also
J = np.ascontiguousarray(tmp[:,1],dtype='intc')
I -= 1 # for 1-based indexing
J -= 1
if res.shape[1] == 3:
V = np.ascontiguousarray(tmp[:,2],dtype='float')
else:
V = np.ascontiguousarray(tmp[:,2],dtype='complex')
V.imag = tmp[:,3]
return scipy.sparse.coo_matrix((V,(I,J)), dims)
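# Illustrative example of the layout described in the docstring above: a real
# 2x2 matrix with non-zeros A[0, 0] = 3.5 and A[1, 1] = 7.0 is stored on disk
# as the (N+1) x 3 float64 array
#   [[1., 1., 3.5],
#    [2., 2., 7.0],
#    [2., 2., 0.0]]
# where the last row holds the shape (2, 2) plus a padding zero.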
def shape_from_header(self, hdr):
'''Read the shape of the array described by the header.
The file position after this call is unspecified.
'''
mclass = hdr.mclass
if mclass == mxFULL_CLASS:
shape = tuple(map(int, hdr.dims))
elif mclass == mxCHAR_CLASS:
shape = tuple(map(int, hdr.dims))
if self.chars_as_strings:
shape = shape[:-1]
elif mclass == mxSPARSE_CLASS:
dt = hdr.dtype
dims = hdr.dims
if not (len(dims) == 2 and dims[0] >= 1 and dims[1] >= 1):
return ()
# Read only the row and column counts
self.mat_stream.seek(dt.itemsize * (dims[0] - 1), 1)
rows = np.ndarray(shape=(), dtype=dt,
buffer=self.mat_stream.read(dt.itemsize))
self.mat_stream.seek(dt.itemsize * (dims[0] - 1), 1)
cols = np.ndarray(shape=(), dtype=dt,
buffer=self.mat_stream.read(dt.itemsize))
shape = (int(rows), int(cols))
else:
raise TypeError('No reader for class code %s' % mclass)
if self.squeeze_me:
shape = tuple([x for x in shape if x != 1])
return shape
class MatFile4Reader(MatFileReader):
''' Reader for Mat4 files '''
@docfiller
def __init__(self, mat_stream, *args, **kwargs):
''' Initialize matlab 4 file reader
%(matstream_arg)s
%(load_args)s
'''
super().__init__(mat_stream, *args, **kwargs)
self._matrix_reader = None
def guess_byte_order(self):
self.mat_stream.seek(0)
mopt = read_dtype(self.mat_stream, np.dtype('i4'))
self.mat_stream.seek(0)
if mopt == 0:
return '<'
if mopt < 0 or mopt > 5000:
# Number must have been byteswapped
return '>' if SYS_LITTLE_ENDIAN else '<'
# Not byteswapped
return '<' if SYS_LITTLE_ENDIAN else '>'
def initialize_read(self):
''' Run when beginning read of variables
Sets up readers from parameters in `self`
'''
self.dtypes = convert_dtypes(mdtypes_template, self.byte_order)
self._matrix_reader = VarReader4(self)
def read_var_header(self):
''' Read and return header, next position
Parameters
----------
None
Returns
-------
header : object
object that can be passed to self.read_var_array, and that
has attributes ``name`` and ``is_global``
next_position : int
position in stream of next variable
'''
hdr = self._matrix_reader.read_header()
n = reduce(lambda x, y: x*y, hdr.dims, 1) # fast product
remaining_bytes = hdr.dtype.itemsize * n
if hdr.is_complex and not hdr.mclass == mxSPARSE_CLASS:
remaining_bytes *= 2
next_position = self.mat_stream.tell() + remaining_bytes
return hdr, next_position
def read_var_array(self, header, process=True):
''' Read array, given `header`
Parameters
----------
header : header object
object with fields defining variable header
process : {True, False}, optional
If True, apply recursive post-processing during loading of array.
Returns
-------
arr : array
array with post-processing applied or not according to
`process`.
'''
return self._matrix_reader.array_from_header(header, process)
def get_variables(self, variable_names=None):
''' get variables from stream as dictionary
Parameters
----------
variable_names : None or str or sequence of str, optional
variable name, or sequence of variable names to get from Mat file /
file stream. If None, then get all variables in file.
'''
if isinstance(variable_names, str):
variable_names = [variable_names]
elif variable_names is not None:
variable_names = list(variable_names)
self.mat_stream.seek(0)
# set up variable reader
self.initialize_read()
mdict = {}
while not self.end_of_stream():
hdr, next_position = self.read_var_header()
name = 'None' if hdr.name is None else hdr.name.decode('latin1')
if variable_names is not None and name not in variable_names:
self.mat_stream.seek(next_position)
continue
mdict[name] = self.read_var_array(hdr)
self.mat_stream.seek(next_position)
if variable_names is not None:
variable_names.remove(name)
if len(variable_names) == 0:
break
return mdict
def list_variables(self):
''' list variables from stream '''
self.mat_stream.seek(0)
# set up variable reader
self.initialize_read()
vars = []
while not self.end_of_stream():
hdr, next_position = self.read_var_header()
name = 'None' if hdr.name is None else hdr.name.decode('latin1')
shape = self._matrix_reader.shape_from_header(hdr)
info = mclass_info.get(hdr.mclass, 'unknown')
vars.append((name, shape, info))
self.mat_stream.seek(next_position)
return vars
def arr_to_2d(arr, oned_as='row'):
''' Make ``arr`` exactly two dimensional
If `arr` has more than 2 dimensions, raise a ValueError
Parameters
----------
arr : array
oned_as : {'row', 'column'}, optional
Whether to reshape 1-D vectors as row vectors or column vectors.
See documentation for ``matdims`` for more detail
Returns
-------
arr2d : array
2-D version of the array
'''
dims = matdims(arr, oned_as)
if len(dims) > 2:
raise ValueError('Matlab 4 files cannot save arrays with more than '
'2 dimensions')
return arr.reshape(dims)
class VarWriter4:
def __init__(self, file_writer):
self.file_stream = file_writer.file_stream
self.oned_as = file_writer.oned_as
def write_bytes(self, arr):
self.file_stream.write(arr.tobytes(order='F'))
def write_string(self, s):
self.file_stream.write(s)
def write_header(self, name, shape, P=miDOUBLE, T=mxFULL_CLASS, imagf=0):
''' Write header for given data options
Parameters
----------
name : str
name of variable
shape : sequence
Shape of array as it will be read in matlab
P : int, optional
code for mat4 data type, one of ``miDOUBLE, miSINGLE, miINT32,
miINT16, miUINT16, miUINT8``
T : int, optional
code for mat4 matrix class, one of ``mxFULL_CLASS, mxCHAR_CLASS,
mxSPARSE_CLASS``
imagf : int, optional
flag indicating complex
'''
header = np.empty((), mdtypes_template['header'])
M = not SYS_LITTLE_ENDIAN
O = 0
header['mopt'] = (M * 1000 +
O * 100 +
P * 10 +
T)
header['mrows'] = shape[0]
header['ncols'] = shape[1]
header['imagf'] = imagf
header['namlen'] = len(name) + 1
self.write_bytes(header)
data = name + '\0'
self.write_string(data.encode('latin1'))
def write(self, arr, name):
''' Write matrix `arr`, with name `name`
Parameters
----------
arr : array_like
array to write
name : str
name in matlab workspace
'''
# we need to catch sparse first, because np.asarray returns
# an object array for scipy.sparse
if scipy.sparse.issparse(arr):
self.write_sparse(arr, name)
return
arr = np.asarray(arr)
dt = arr.dtype
if not dt.isnative:
arr = arr.astype(dt.newbyteorder('='))
dtt = dt.type
if dtt is np.object_:
raise TypeError('Cannot save object arrays in Mat4')
elif dtt is np.void:
raise TypeError('Cannot save void type arrays')
elif dtt in (np.str_, np.bytes_):
self.write_char(arr, name)
return
self.write_numeric(arr, name)
def write_numeric(self, arr, name):
arr = arr_to_2d(arr, self.oned_as)
imagf = arr.dtype.kind == 'c'
try:
P = np_to_mtypes[arr.dtype.str[1:]]
except KeyError:
if imagf:
arr = arr.astype('c128')
else:
arr = arr.astype('f8')
P = miDOUBLE
self.write_header(name,
arr.shape,
P=P,
T=mxFULL_CLASS,
imagf=imagf)
if imagf:
self.write_bytes(arr.real)
self.write_bytes(arr.imag)
else:
self.write_bytes(arr)
def write_char(self, arr, name):
if arr.dtype.type == np.str_ and arr.dtype.itemsize != np.dtype('U1').itemsize:
arr = arr_to_chars(arr)
arr = arr_to_2d(arr, self.oned_as)
dims = arr.shape
self.write_header(
name,
dims,
P=miUINT8,
T=mxCHAR_CLASS)
if arr.dtype.kind == 'U':
# Recode unicode to latin1
n_chars = math.prod(dims)
st_arr = np.ndarray(shape=(),
dtype=arr_dtype_number(arr, n_chars),
buffer=arr)
st = st_arr.item().encode('latin-1')
arr = np.ndarray(shape=dims, dtype='S1', buffer=st)
self.write_bytes(arr)
def write_sparse(self, arr, name):
''' Sparse matrices are 2-D
See docstring for VarReader4.read_sparse_array
'''
A = arr.tocoo() # convert to sparse COO format (ijv)
imagf = A.dtype.kind == 'c'
ijv = np.zeros((A.nnz + 1, 3+imagf), dtype='f8')
ijv[:-1,0] = A.row
ijv[:-1,1] = A.col
ijv[:-1,0:2] += 1 # 1 based indexing
if imagf:
ijv[:-1,2] = A.data.real
ijv[:-1,3] = A.data.imag
else:
ijv[:-1,2] = A.data
ijv[-1,0:2] = A.shape
self.write_header(
name,
ijv.shape,
P=miDOUBLE,
T=mxSPARSE_CLASS)
self.write_bytes(ijv)
class MatFile4Writer:
''' Class for writing matlab 4 format files '''
def __init__(self, file_stream, oned_as=None):
self.file_stream = file_stream
if oned_as is None:
oned_as = 'row'
self.oned_as = oned_as
self._matrix_writer = None
def put_variables(self, mdict, write_header=None):
''' Write variables in `mdict` to stream
Parameters
----------
mdict : mapping
mapping with method ``items`` returning name, contents pairs,
where ``name`` is the name under which the variable will appear
in the matlab workspace on file load, and ``contents`` is
something writeable to a matlab file, such as a NumPy array.
write_header : {None, True, False}, optional
If True, then write the matlab file header before writing the
variables. If None (the default) then write the file header
if we are at position 0 in the stream. By setting False
here, and setting the stream position to the end of the file,
you can append variables to a matlab file
'''
# there is no header for a matlab 4 mat file, so we ignore the
# ``write_header`` input argument. It's there for compatibility
# with the matlab 5 version of this method
self._matrix_writer = VarWriter4(self)
for name, var in mdict.items():
self._matrix_writer.write(var, name)
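# Hedged usage sketch (hypothetical helper, public scipy.io API only):
# format='4' routes savemat through the MatFile4Writer above; a 1-D array
# comes back 2-D, shaped according to oned_as.
def _demo_mat4_roundtrip():
    import io
    import scipy.io
    buf = io.BytesIO()
    scipy.io.savemat(buf, {'x': np.arange(3.0)}, format='4', oned_as='row')
    buf.seek(0)
    return scipy.io.loadmat(buf)['x'].shape   # (1, 3)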

View File

@ -0,0 +1,895 @@
''' Classes for read / write of matlab (TM) 5 files
The matfile specification last found here:
https://www.mathworks.com/access/helpdesk/help/pdf_doc/matlab/matfile_format.pdf
(as of December 5 2008)
=================================
Note on functions and mat files
=================================
The document above does not give any hints as to the storage of matlab
function handles, or anonymous function handles. I had, therefore, to
guess the format of matlab arrays of ``mxFUNCTION_CLASS`` and
``mxOPAQUE_CLASS`` by looking at example mat files.
``mxFUNCTION_CLASS`` stores all types of matlab functions. It seems to
contain a struct matrix with a set pattern of fields. For anonymous
functions, a sub-field of one of these fields seems to contain the
well-named ``mxOPAQUE_CLASS``. This seems to contain:
* array flags as for any matlab matrix
* 3 int8 strings
* a matrix
It seems that whenever the mat file contains a ``mxOPAQUE_CLASS``
instance, there is also an un-named matrix (name == '') at the end of
the mat file. I'll call this the ``__function_workspace__`` matrix.
When I saved two anonymous functions in a mat file, or appended another
anonymous function to the mat file, there was still only one
``__function_workspace__`` un-named matrix at the end, but larger than
that for a mat file with a single anonymous function, suggesting that
the workspaces for the two functions had been merged.
The ``__function_workspace__`` matrix appears to be of double class
(``mxDOUBLE_CLASS``), but stored as uint8, the memory for which is in
the format of a mini .mat file, without the first 124 bytes of the file
header (the description and the subsystem_offset), but with the version
U2 bytes, and the S2 endian test bytes. There follow 4 zero bytes,
presumably for 8 byte padding, and then a series of ``miMATRIX``
entries, as in a standard mat file. The ``miMATRIX`` entries appear to
be series of un-named (name == '') matrices, and may also contain arrays
of this same mini-mat format.
I guess that:
* saving an anonymous function back to a mat file will need the
associated ``__function_workspace__`` matrix saved as well for the
anonymous function to work correctly.
* appending to a mat file that has a ``__function_workspace__`` would
involve first pulling off this workspace, appending, checking whether
there were any more anonymous functions appended, and then somehow
merging the relevant workspaces, and saving at the end of the mat
file.
The mat files I was playing with are in ``tests/data``:
* sqr.mat
* parabola.mat
* some_functions.mat
See ``tests/test_mio.py:test_mio_funcs.py`` for the debugging
script I was working with.
Small fragments of current code adapted from matfile.py by Heiko
Henkelmann; parts of the code for simplify_cells=True adapted from
http://blog.nephics.com/2019/08/28/better-loadmat-for-scipy/.
'''
import math
import os
import time
import sys
import zlib
from io import BytesIO
import warnings
import numpy as np
import scipy.sparse
from ._byteordercodes import native_code, swapped_code
from ._miobase import (MatFileReader, docfiller, matdims, read_dtype,
arr_to_chars, arr_dtype_number, MatWriteError,
MatReadError, MatReadWarning)
# Reader object for matlab 5 format variables
from ._mio5_utils import VarReader5
# Constants and helper objects
from ._mio5_params import (MatlabObject, MatlabFunction, MDTYPES, NP_TO_MTYPES,
NP_TO_MXTYPES, miCOMPRESSED, miMATRIX, miINT8,
miUTF8, miUINT32, mxCELL_CLASS, mxSTRUCT_CLASS,
mxOBJECT_CLASS, mxCHAR_CLASS, mxSPARSE_CLASS,
mxDOUBLE_CLASS, mclass_info, mat_struct)
from ._streams import ZlibInputStream
def _has_struct(elem):
"""Determine if elem is an array and if first array item is a struct."""
return (isinstance(elem, np.ndarray) and (elem.size > 0) and (elem.ndim > 0) and
isinstance(elem[0], mat_struct))
def _inspect_cell_array(ndarray):
"""Construct lists from cell arrays (loaded as numpy ndarrays), recursing
into items if they contain mat_struct objects."""
elem_list = []
for sub_elem in ndarray:
if isinstance(sub_elem, mat_struct):
elem_list.append(_matstruct_to_dict(sub_elem))
elif _has_struct(sub_elem):
elem_list.append(_inspect_cell_array(sub_elem))
else:
elem_list.append(sub_elem)
return elem_list
def _matstruct_to_dict(matobj):
"""Construct nested dicts from mat_struct objects."""
d = {}
for f in matobj._fieldnames:
elem = matobj.__dict__[f]
if isinstance(elem, mat_struct):
d[f] = _matstruct_to_dict(elem)
elif _has_struct(elem):
d[f] = _inspect_cell_array(elem)
else:
d[f] = elem
return d
def _simplify_cells(d):
"""Convert mat objects in dict to nested dicts."""
for key in d:
if isinstance(d[key], mat_struct):
d[key] = _matstruct_to_dict(d[key])
elif _has_struct(d[key]):
d[key] = _inspect_cell_array(d[key])
return d
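# Hedged sketch (hypothetical helper, public API only) of what the three
# converters above achieve end-to-end: with simplify_cells=True, loadmat
# hands back plain nested dicts instead of mat_struct objects.
def _demo_simplify_cells():
    import io
    from scipy.io import savemat, loadmat
    buf = io.BytesIO()
    savemat(buf, {'s': {'a': 1.0, 'b': 'hi'}})
    buf.seek(0)
    return loadmat(buf, simplify_cells=True)['s']   # roughly {'a': 1.0, 'b': 'hi'}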
class MatFile5Reader(MatFileReader):
''' Reader for Mat 5 mat files
Adds the following attribute to base class
uint16_codec - char codec to use for uint16 char arrays
(defaults to system default codec)
Uses variable reader that has the following standard interface (see
abstract class in ``miobase``)::
__init__(self, file_reader)
read_header(self)
array_from_header(self)
and added interface::
set_stream(self, stream)
read_full_tag(self)
'''
@docfiller
def __init__(self,
mat_stream,
byte_order=None,
mat_dtype=False,
squeeze_me=False,
chars_as_strings=True,
matlab_compatible=False,
struct_as_record=True,
verify_compressed_data_integrity=True,
uint16_codec=None,
simplify_cells=False):
'''Initializer for matlab 5 file format reader
%(matstream_arg)s
%(load_args)s
%(struct_arg)s
uint16_codec : {None, string}
Set codec to use for uint16 char arrays (e.g., 'utf-8').
Use system default codec if None
'''
super().__init__(
mat_stream,
byte_order,
mat_dtype,
squeeze_me,
chars_as_strings,
matlab_compatible,
struct_as_record,
verify_compressed_data_integrity,
simplify_cells)
# Set uint16 codec
if not uint16_codec:
uint16_codec = sys.getdefaultencoding()
self.uint16_codec = uint16_codec
# placeholders for readers - see initialize_read method
self._file_reader = None
self._matrix_reader = None
def guess_byte_order(self):
''' Guess byte order.
Sets stream pointer to 0'''
self.mat_stream.seek(126)
mi = self.mat_stream.read(2)
self.mat_stream.seek(0)
return '<' if mi == b'IM' else '>'
def read_file_header(self):
''' Read in mat 5 file header '''
hdict = {}
hdr_dtype = MDTYPES[self.byte_order]['dtypes']['file_header']
hdr = read_dtype(self.mat_stream, hdr_dtype)
hdict['__header__'] = hdr['description'].item().strip(b' \t\n\000')
v_major = hdr['version'] >> 8
v_minor = hdr['version'] & 0xFF
hdict['__version__'] = '%d.%d' % (v_major, v_minor)
return hdict
def initialize_read(self):
''' Run when beginning read of variables
Sets up readers from parameters in `self`
'''
# reader for top level stream. We need this extra top-level
# reader because we use the matrix_reader object to contain
# compressed matrices (so they have their own stream)
self._file_reader = VarReader5(self)
# reader for matrix streams
self._matrix_reader = VarReader5(self)
def read_var_header(self):
''' Read header, return header, next position
Header has to define at least .name and .is_global
Parameters
----------
None
Returns
-------
header : object
object that can be passed to self.read_var_array, and that
has attributes .name and .is_global
next_position : int
position in stream of next variable
'''
mdtype, byte_count = self._file_reader.read_full_tag()
if not byte_count > 0:
raise ValueError("Did not read any bytes")
next_pos = self.mat_stream.tell() + byte_count
if mdtype == miCOMPRESSED:
# Make new stream from compressed data
stream = ZlibInputStream(self.mat_stream, byte_count)
self._matrix_reader.set_stream(stream)
check_stream_limit = self.verify_compressed_data_integrity
mdtype, byte_count = self._matrix_reader.read_full_tag()
else:
check_stream_limit = False
self._matrix_reader.set_stream(self.mat_stream)
if not mdtype == miMATRIX:
raise TypeError('Expecting miMATRIX type here, got %d' % mdtype)
header = self._matrix_reader.read_header(check_stream_limit)
return header, next_pos
def read_var_array(self, header, process=True):
''' Read array, given `header`
Parameters
----------
header : header object
object with fields defining variable header
process : {True, False} bool, optional
If True, apply recursive post-processing during loading of
array.
Returns
-------
arr : array
array with post-processing applied or not according to
`process`.
'''
return self._matrix_reader.array_from_header(header, process)
def get_variables(self, variable_names=None):
''' get variables from stream as dictionary
variable_names - optional list of variable names to get
If variable_names is None, then get all variables in file
'''
if isinstance(variable_names, str):
variable_names = [variable_names]
elif variable_names is not None:
variable_names = list(variable_names)
self.mat_stream.seek(0)
# Here we pass all the parameters in self to the reading objects
self.initialize_read()
mdict = self.read_file_header()
mdict['__globals__'] = []
while not self.end_of_stream():
hdr, next_position = self.read_var_header()
name = 'None' if hdr.name is None else hdr.name.decode('latin1')
if name in mdict:
msg = (
f'Duplicate variable name "{name}" in stream'
" - replacing previous with new\nConsider"
"scipy.io.matlab._mio5.varmats_from_mat to split "
"file into single variable files"
)
warnings.warn(msg, MatReadWarning, stacklevel=2)
if name == '':
# can only be a matlab 7 function workspace
name = '__function_workspace__'
# We want to keep this raw because mat_dtype processing
# will break the format (uint8 as mxDOUBLE_CLASS)
process = False
else:
process = True
if variable_names is not None and name not in variable_names:
self.mat_stream.seek(next_position)
continue
try:
res = self.read_var_array(hdr, process)
except MatReadError as err:
warnings.warn(
f'Unreadable variable "{name}", because "{err}"',
Warning, stacklevel=2)
res = "Read error: %s" % err
self.mat_stream.seek(next_position)
mdict[name] = res
if hdr.is_global:
mdict['__globals__'].append(name)
if variable_names is not None:
variable_names.remove(name)
if len(variable_names) == 0:
break
if self.simplify_cells:
return _simplify_cells(mdict)
else:
return mdict
def list_variables(self):
''' list variables from stream '''
self.mat_stream.seek(0)
# Here we pass all the parameters in self to the reading objects
self.initialize_read()
self.read_file_header()
vars = []
while not self.end_of_stream():
hdr, next_position = self.read_var_header()
name = 'None' if hdr.name is None else hdr.name.decode('latin1')
if name == '':
# can only be a matlab 7 function workspace
name = '__function_workspace__'
shape = self._matrix_reader.shape_from_header(hdr)
if hdr.is_logical:
info = 'logical'
else:
info = mclass_info.get(hdr.mclass, 'unknown')
vars.append((name, shape, info))
self.mat_stream.seek(next_position)
return vars
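# Hedged sketch (hypothetical helper): list_variables backs the public
# scipy.io.whosmat wrapper, yielding (name, shape, class-info) triples
# without reading the variable data itself.
def _demo_whosmat():
    import io
    from scipy.io import savemat, whosmat
    buf = io.BytesIO()
    savemat(buf, {'a': np.eye(2), 's': 'hello'})
    buf.seek(0)
    return whosmat(buf)   # e.g. [('a', (2, 2), 'double'), ('s', (1, 5), 'char')]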
def varmats_from_mat(file_obj):
""" Pull variables out of mat 5 file as a sequence of mat file objects
This can be useful with a difficult mat file, containing unreadable
variables. This routine pulls the variables out in raw form and puts them,
unread, back into a file stream for saving or reading. Another use is the
pathological case where there is more than one variable of the same name in
the file; this routine returns the duplicates, whereas the standard reader
will overwrite duplicates in the returned dictionary.
The file pointer in `file_obj` will be undefined. File pointers for the
returned file-like objects are set at 0.
Parameters
----------
file_obj : file-like
file object containing mat file
Returns
-------
named_mats : list
list contains tuples of (name, BytesIO) where BytesIO is a file-like
object containing mat file contents as for a single variable. The
BytesIO contains a string with the original header and a single var. If
``var_file_obj`` is an individual BytesIO instance, then save as a mat
file with something like ``open('test.mat',
'wb').write(var_file_obj.read())``
Examples
--------
>>> import scipy.io
>>> import numpy as np
>>> from io import BytesIO
>>> from scipy.io.matlab._mio5 import varmats_from_mat
>>> mat_fileobj = BytesIO()
>>> scipy.io.savemat(mat_fileobj, {'b': np.arange(10), 'a': 'a string'})
>>> varmats = varmats_from_mat(mat_fileobj)
>>> sorted([name for name, str_obj in varmats])
['a', 'b']
"""
rdr = MatFile5Reader(file_obj)
file_obj.seek(0)
# Raw read of top-level file header
hdr_len = MDTYPES[native_code]['dtypes']['file_header'].itemsize
raw_hdr = file_obj.read(hdr_len)
# Initialize variable reading
file_obj.seek(0)
rdr.initialize_read()
rdr.read_file_header()
next_position = file_obj.tell()
named_mats = []
while not rdr.end_of_stream():
start_position = next_position
hdr, next_position = rdr.read_var_header()
name = 'None' if hdr.name is None else hdr.name.decode('latin1')
# Read raw variable string
file_obj.seek(start_position)
byte_count = next_position - start_position
var_str = file_obj.read(byte_count)
# write to stringio object
out_obj = BytesIO()
out_obj.write(raw_hdr)
out_obj.write(var_str)
out_obj.seek(0)
named_mats.append((name, out_obj))
return named_mats
class EmptyStructMarker:
""" Class to indicate presence of empty matlab struct on output """
def to_writeable(source):
''' Convert input object ``source`` to something we can write
Parameters
----------
source : object
Returns
-------
arr : None or ndarray or EmptyStructMarker
If `source` cannot be converted to something we can write to a matfile,
return None. If `source` is equivalent to an empty dictionary, return
``EmptyStructMarker``. Otherwise return `source` converted to an
ndarray with contents for writing to matfile.
'''
if isinstance(source, np.ndarray):
return source
if source is None:
return None
if hasattr(source, "__array__"):
return np.asarray(source)
# Objects that implement mappings
is_mapping = (hasattr(source, 'keys') and hasattr(source, 'values') and
hasattr(source, 'items'))
# Objects that don't implement mappings, but do have dicts
if isinstance(source, np.generic):
# NumPy scalars are never mappings (PyPy issue workaround)
pass
elif not is_mapping and hasattr(source, '__dict__'):
source = {key: value for key, value in source.__dict__.items()
if not key.startswith('_')}
is_mapping = True
if is_mapping:
dtype = []
values = []
for field, value in source.items():
if (isinstance(field, str) and
field[0] not in '_0123456789'):
dtype.append((str(field), object))
values.append(value)
if dtype:
return np.array([tuple(values)], dtype)
else:
return EmptyStructMarker
# Next try and convert to an array
try:
narr = np.asanyarray(source)
except ValueError:
narr = np.asanyarray(source, dtype=object)
if narr.dtype.type in (object, np.object_) and \
narr.shape == () and narr == source:
# No interesting conversion possible
return None
return narr
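# Hedged sketch (hypothetical helper) of the observable effect of
# to_writeable: a dict passed to savemat becomes a 1x1 record array,
# i.e. a MATLAB struct with one field per key.
def _demo_dict_to_struct():
    import io
    from scipy.io import savemat, loadmat
    buf = io.BytesIO()
    savemat(buf, {'s': {'a': 1, 'b': 'hi'}})
    buf.seek(0)
    rec = loadmat(buf)['s']
    return rec.shape, rec.dtype.names   # (1, 1), ('a', 'b')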
# Native byte ordered dtypes for convenience for writers
NDT_FILE_HDR = MDTYPES[native_code]['dtypes']['file_header']
NDT_TAG_FULL = MDTYPES[native_code]['dtypes']['tag_full']
NDT_TAG_SMALL = MDTYPES[native_code]['dtypes']['tag_smalldata']
NDT_ARRAY_FLAGS = MDTYPES[native_code]['dtypes']['array_flags']
class VarWriter5:
''' Generic matlab matrix writing class '''
mat_tag = np.zeros((), NDT_TAG_FULL)
mat_tag['mdtype'] = miMATRIX
def __init__(self, file_writer):
self.file_stream = file_writer.file_stream
self.unicode_strings = file_writer.unicode_strings
self.long_field_names = file_writer.long_field_names
self.oned_as = file_writer.oned_as
# These are used for top level writes, and unset after
self._var_name = None
self._var_is_global = False
def write_bytes(self, arr):
self.file_stream.write(arr.tobytes(order='F'))
def write_string(self, s):
self.file_stream.write(s)
def write_element(self, arr, mdtype=None):
''' write tag and data '''
if mdtype is None:
mdtype = NP_TO_MTYPES[arr.dtype.str[1:]]
# Array needs to be in native byte order
if arr.dtype.byteorder == swapped_code:
arr = arr.byteswap().view(arr.dtype.newbyteorder())
byte_count = arr.size*arr.itemsize
if byte_count <= 4:
self.write_smalldata_element(arr, mdtype, byte_count)
else:
self.write_regular_element(arr, mdtype, byte_count)
def write_smalldata_element(self, arr, mdtype, byte_count):
# write tag with embedded data
tag = np.zeros((), NDT_TAG_SMALL)
tag['byte_count_mdtype'] = (byte_count << 16) + mdtype
# if arr.tobytes() gives fewer than 4 bytes, the 'data' field is zero-padded as needed.
tag['data'] = arr.tobytes(order='F')
self.write_bytes(tag)
def write_regular_element(self, arr, mdtype, byte_count):
# write tag, data
tag = np.zeros((), NDT_TAG_FULL)
tag['mdtype'] = mdtype
tag['byte_count'] = byte_count
self.write_bytes(tag)
self.write_bytes(arr)
# pad to next 64-bit boundary
bc_mod_8 = byte_count % 8
if bc_mod_8:
self.file_stream.write(b'\x00' * (8-bc_mod_8))
def write_header(self,
shape,
mclass,
is_complex=False,
is_logical=False,
nzmax=0):
''' Write header for given data options
shape : sequence
array shape
mclass - mat5 matrix class
is_complex - True if matrix is complex
is_logical - True if matrix is logical
nzmax - max non zero elements for sparse arrays
We get the name and the global flag from the object, and reset
them to defaults after we've used them
'''
# get name and is_global from one-shot object store
name = self._var_name
is_global = self._var_is_global
# initialize the top-level matrix tag, store position
self._mat_tag_pos = self.file_stream.tell()
self.write_bytes(self.mat_tag)
# write array flags (complex, global, logical, class, nzmax)
af = np.zeros((), NDT_ARRAY_FLAGS)
af['data_type'] = miUINT32
af['byte_count'] = 8
flags = is_complex << 3 | is_global << 2 | is_logical << 1
af['flags_class'] = mclass | flags << 8
af['nzmax'] = nzmax
self.write_bytes(af)
# shape
self.write_element(np.array(shape, dtype='i4'))
# write name
name = np.asarray(name)
if name == '': # empty string zero-terminated
self.write_smalldata_element(name, miINT8, 0)
else:
self.write_element(name, miINT8)
# reset the one-shot store to defaults
self._var_name = ''
self._var_is_global = False
def update_matrix_tag(self, start_pos):
curr_pos = self.file_stream.tell()
self.file_stream.seek(start_pos)
byte_count = curr_pos - start_pos - 8
if byte_count >= 2**32:
raise MatWriteError("Matrix too large to save with Matlab "
"5 format")
self.mat_tag['byte_count'] = byte_count
self.write_bytes(self.mat_tag)
self.file_stream.seek(curr_pos)
def write_top(self, arr, name, is_global):
""" Write variable at top level of mat file
Parameters
----------
arr : array_like
array-like object to create writer for
name : str, optional
name as it will appear in matlab workspace
default is empty string
is_global : {False, True}, optional
whether variable will be global on load into matlab
"""
# these are set before the top-level header write, and unset at
# the end of the same write, because they do not apply for lower levels
self._var_is_global = is_global
self._var_name = name
# write the header and data
self.write(arr)
def write(self, arr):
''' Write `arr` to stream at top and sub levels
Parameters
----------
arr : array_like
array-like object to create writer for
'''
# store position, so we can update the matrix tag
mat_tag_pos = self.file_stream.tell()
# First check if these are sparse
if scipy.sparse.issparse(arr):
self.write_sparse(arr)
self.update_matrix_tag(mat_tag_pos)
return
# Try to convert things that aren't arrays
narr = to_writeable(arr)
if narr is None:
raise TypeError(f'Could not convert {arr} (type {type(arr)}) to array')
if isinstance(narr, MatlabObject):
self.write_object(narr)
elif isinstance(narr, MatlabFunction):
raise MatWriteError('Cannot write matlab functions')
elif narr is EmptyStructMarker: # empty struct array
self.write_empty_struct()
elif narr.dtype.fields: # struct array
self.write_struct(narr)
elif narr.dtype.hasobject: # cell array
self.write_cells(narr)
elif narr.dtype.kind in ('U', 'S'):
if self.unicode_strings:
codec = 'UTF8'
else:
codec = 'ascii'
self.write_char(narr, codec)
else:
self.write_numeric(narr)
self.update_matrix_tag(mat_tag_pos)
def write_numeric(self, arr):
imagf = arr.dtype.kind == 'c'
logif = arr.dtype.kind == 'b'
try:
mclass = NP_TO_MXTYPES[arr.dtype.str[1:]]
except KeyError:
# No matching matlab type, probably complex256 / float128 / float96
# Cast data to complex128 / float64.
if imagf:
arr = arr.astype('c128')
elif logif:
arr = arr.astype('i1') # Should only contain 0/1
else:
arr = arr.astype('f8')
mclass = mxDOUBLE_CLASS
self.write_header(matdims(arr, self.oned_as),
mclass,
is_complex=imagf,
is_logical=logif)
if imagf:
self.write_element(arr.real)
self.write_element(arr.imag)
else:
self.write_element(arr)
def write_char(self, arr, codec='ascii'):
''' Write string array `arr` with given `codec`
'''
if arr.size == 0 or np.all(arr == ''):
# This is an empty string array or a string array containing
# only empty strings. Matlab cannot distinguish between a
# string array that is empty, and a string array containing
# only empty strings, because it stores strings as arrays of
# char. There is no way of having an array of char that is
# not empty, but contains an empty string. We have to
# special-case the array-with-empty-strings because even
# empty strings have zero padding, which would otherwise
# appear in matlab as a string with a space.
shape = (0,) * np.max([arr.ndim, 2])
self.write_header(shape, mxCHAR_CLASS)
self.write_smalldata_element(arr, miUTF8, 0)
return
# non-empty string.
#
# Convert to char array
arr = arr_to_chars(arr)
# We have to write the shape directly, because we are going to
# recode the characters, and the resulting stream of chars
# may have a different length
shape = arr.shape
self.write_header(shape, mxCHAR_CLASS)
if arr.dtype.kind == 'U' and arr.size:
# Make one long string from all the characters. We need to
# transpose here, because we're flattening the array, before
# we write the bytes. The bytes have to be written in
# Fortran order.
n_chars = math.prod(shape)
st_arr = np.ndarray(shape=(),
dtype=arr_dtype_number(arr, n_chars),
buffer=arr.T.copy()) # Fortran order
# Recode with codec to give byte string
st = st_arr.item().encode(codec)
# Reconstruct as 1-D byte array
arr = np.ndarray(shape=(len(st),),
dtype='S1',
buffer=st)
self.write_element(arr, mdtype=miUTF8)
def write_sparse(self, arr):
''' Sparse matrices are 2D
'''
A = arr.tocsc() # convert to sparse CSC format
A.sort_indices() # MATLAB expects sorted row indices
is_complex = (A.dtype.kind == 'c')
is_logical = (A.dtype.kind == 'b')
nz = A.nnz
self.write_header(matdims(arr, self.oned_as),
mxSPARSE_CLASS,
is_complex=is_complex,
is_logical=is_logical,
# matlab won't load file with 0 nzmax
nzmax=1 if nz == 0 else nz)
self.write_element(A.indices.astype('i4'))
self.write_element(A.indptr.astype('i4'))
self.write_element(A.data.real)
if is_complex:
self.write_element(A.data.imag)
def write_cells(self, arr):
self.write_header(matdims(arr, self.oned_as),
mxCELL_CLASS)
# loop over data, column major
A = np.atleast_2d(arr).flatten('F')
for el in A:
self.write(el)
def write_empty_struct(self):
self.write_header((1, 1), mxSTRUCT_CLASS)
# max field name length set to 1 in an example matlab struct
self.write_element(np.array(1, dtype=np.int32))
# Field names element is empty
self.write_element(np.array([], dtype=np.int8))
def write_struct(self, arr):
self.write_header(matdims(arr, self.oned_as),
mxSTRUCT_CLASS)
self._write_items(arr)
def _write_items(self, arr):
# write fieldnames
fieldnames = [f[0] for f in arr.dtype.descr]
length = max([len(fieldname) for fieldname in fieldnames])+1
max_length = 64 if self.long_field_names else 32
if length > max_length:
raise ValueError("Field names are restricted to %d characters" %
(max_length-1))
self.write_element(np.array([length], dtype='i4'))
self.write_element(
np.array(fieldnames, dtype='S%d' % (length)),
mdtype=miINT8)
A = np.atleast_2d(arr).flatten('F')
for el in A:
for f in fieldnames:
self.write(el[f])
def write_object(self, arr):
'''Same as writing structs, except different mx class, and extra
classname element after header
'''
self.write_header(matdims(arr, self.oned_as),
mxOBJECT_CLASS)
self.write_element(np.array(arr.classname, dtype='S'),
mdtype=miINT8)
self._write_items(arr)
class MatFile5Writer:
''' Class for writing mat5 files '''
@docfiller
def __init__(self, file_stream,
do_compression=False,
unicode_strings=False,
global_vars=None,
long_field_names=False,
oned_as='row'):
''' Initialize writer for matlab 5 format files
Parameters
----------
%(do_compression)s
%(unicode_strings)s
global_vars : None or sequence of strings, optional
Names of variables to be marked as global for matlab
%(long_fields)s
%(oned_as)s
'''
self.file_stream = file_stream
self.do_compression = do_compression
self.unicode_strings = unicode_strings
if global_vars:
self.global_vars = global_vars
else:
self.global_vars = []
self.long_field_names = long_field_names
self.oned_as = oned_as
self._matrix_writer = None
def write_file_header(self):
# write header
hdr = np.zeros((), NDT_FILE_HDR)
hdr['description'] = (f'MATLAB 5.0 MAT-file Platform: {os.name}, '
f'Created on: {time.asctime()}')
hdr['version'] = 0x0100
hdr['endian_test'] = np.ndarray(shape=(),
dtype='S2',
buffer=np.uint16(0x4d49))
self.file_stream.write(hdr.tobytes())
def put_variables(self, mdict, write_header=None):
''' Write variables in `mdict` to stream
Parameters
----------
mdict : mapping
mapping with method ``items`` returning name, contents pairs,
where ``name`` is the name under which the variable will appear in
the matlab workspace on file load, and ``contents`` is something
writeable to a matlab file, such as a NumPy array.
write_header : {None, True, False}, optional
If True, then write the matlab file header before writing the
variables. If None (the default) then write the file header
if we are at position 0 in the stream. By setting False
here, and setting the stream position to the end of the file,
you can append variables to a matlab file
'''
# write header if requested, or None and start of file
if write_header is None:
write_header = self.file_stream.tell() == 0
if write_header:
self.write_file_header()
self._matrix_writer = VarWriter5(self)
for name, var in mdict.items():
if name[0] == '_':
continue
is_global = name in self.global_vars
if self.do_compression:
stream = BytesIO()
self._matrix_writer.file_stream = stream
self._matrix_writer.write_top(var, name.encode('latin1'), is_global)
out_str = zlib.compress(stream.getvalue())
tag = np.empty((), NDT_TAG_FULL)
tag['mdtype'] = miCOMPRESSED
tag['byte_count'] = len(out_str)
self.file_stream.write(tag.tobytes())
self.file_stream.write(out_str)
else: # not compressing
self._matrix_writer.write_top(var, name.encode('latin1'), is_global)
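# Hedged sketch (hypothetical helper) of the do_compression branch above:
# each variable is zlib-compressed into a miCOMPRESSED element, so highly
# redundant data shrinks dramatically on disk.
def _demo_compression():
    import io
    from scipy.io import savemat
    data = {'z': np.zeros((100, 100))}
    raw, packed = io.BytesIO(), io.BytesIO()
    savemat(raw, data)
    savemat(packed, data, do_compression=True)
    return raw.getbuffer().nbytes > packed.getbuffer().nbytes   # True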

View File

@ -0,0 +1,281 @@
''' Constants and classes for matlab 5 read and write
See also mio5_utils.pyx where these same constants arise as c enums.
If you make changes in this file, don't forget to change mio5_utils.pyx
'''
import numpy as np
from ._miobase import convert_dtypes
__all__ = [
'MDTYPES', 'MatlabFunction', 'MatlabObject', 'MatlabOpaque',
'NP_TO_MTYPES', 'NP_TO_MXTYPES', 'OPAQUE_DTYPE', 'codecs_template',
'mat_struct', 'mclass_dtypes_template', 'mclass_info', 'mdtypes_template',
'miCOMPRESSED', 'miDOUBLE', 'miINT16', 'miINT32', 'miINT64', 'miINT8',
'miMATRIX', 'miSINGLE', 'miUINT16', 'miUINT32', 'miUINT64', 'miUINT8',
'miUTF16', 'miUTF32', 'miUTF8', 'mxCELL_CLASS', 'mxCHAR_CLASS',
'mxDOUBLE_CLASS', 'mxFUNCTION_CLASS', 'mxINT16_CLASS', 'mxINT32_CLASS',
'mxINT64_CLASS', 'mxINT8_CLASS', 'mxOBJECT_CLASS',
'mxOBJECT_CLASS_FROM_MATRIX_H', 'mxOPAQUE_CLASS', 'mxSINGLE_CLASS',
'mxSPARSE_CLASS', 'mxSTRUCT_CLASS', 'mxUINT16_CLASS', 'mxUINT32_CLASS',
'mxUINT64_CLASS', 'mxUINT8_CLASS'
]
miINT8 = 1
miUINT8 = 2
miINT16 = 3
miUINT16 = 4
miINT32 = 5
miUINT32 = 6
miSINGLE = 7
miDOUBLE = 9
miINT64 = 12
miUINT64 = 13
miMATRIX = 14
miCOMPRESSED = 15
miUTF8 = 16
miUTF16 = 17
miUTF32 = 18
mxCELL_CLASS = 1
mxSTRUCT_CLASS = 2
# The March 2008 edition of "Matlab 7 MAT-File Format" says that
# mxOBJECT_CLASS = 3, whereas matrix.h says that mxLOGICAL = 3.
# Matlab 2008a appears to save logicals as type 9, so we assume that
# the document is correct. See type 18, below.
mxOBJECT_CLASS = 3
mxCHAR_CLASS = 4
mxSPARSE_CLASS = 5
mxDOUBLE_CLASS = 6
mxSINGLE_CLASS = 7
mxINT8_CLASS = 8
mxUINT8_CLASS = 9
mxINT16_CLASS = 10
mxUINT16_CLASS = 11
mxINT32_CLASS = 12
mxUINT32_CLASS = 13
# The following are not in the March 2008 edition of "Matlab 7
# MAT-File Format," but were guessed from matrix.h.
mxINT64_CLASS = 14
mxUINT64_CLASS = 15
mxFUNCTION_CLASS = 16
# Not doing anything with these at the moment.
mxOPAQUE_CLASS = 17 # This appears to be a function workspace
# Thread 'saving/loading symbol table of anonymous functions',
# octave-maintainers, April-May 2007
# https://lists.gnu.org/archive/html/octave-maintainers/2007-04/msg00031.html
# https://lists.gnu.org/archive/html/octave-maintainers/2007-05/msg00032.html
# (Was/Deprecated: https://www-old.cae.wisc.edu/pipermail/octave-maintainers/2007-May/002824.html)
mxOBJECT_CLASS_FROM_MATRIX_H = 18
mdtypes_template = {
miINT8: 'i1',
miUINT8: 'u1',
miINT16: 'i2',
miUINT16: 'u2',
miINT32: 'i4',
miUINT32: 'u4',
miSINGLE: 'f4',
miDOUBLE: 'f8',
miINT64: 'i8',
miUINT64: 'u8',
miUTF8: 'u1',
miUTF16: 'u2',
miUTF32: 'u4',
'file_header': [('description', 'S116'),
('subsystem_offset', 'i8'),
('version', 'u2'),
('endian_test', 'S2')],
'tag_full': [('mdtype', 'u4'), ('byte_count', 'u4')],
'tag_smalldata':[('byte_count_mdtype', 'u4'), ('data', 'S4')],
'array_flags': [('data_type', 'u4'),
('byte_count', 'u4'),
('flags_class','u4'),
('nzmax', 'u4')],
'U1': 'U1',
}
mclass_dtypes_template = {
mxINT8_CLASS: 'i1',
mxUINT8_CLASS: 'u1',
mxINT16_CLASS: 'i2',
mxUINT16_CLASS: 'u2',
mxINT32_CLASS: 'i4',
mxUINT32_CLASS: 'u4',
mxINT64_CLASS: 'i8',
mxUINT64_CLASS: 'u8',
mxSINGLE_CLASS: 'f4',
mxDOUBLE_CLASS: 'f8',
}
mclass_info = {
mxINT8_CLASS: 'int8',
mxUINT8_CLASS: 'uint8',
mxINT16_CLASS: 'int16',
mxUINT16_CLASS: 'uint16',
mxINT32_CLASS: 'int32',
mxUINT32_CLASS: 'uint32',
mxINT64_CLASS: 'int64',
mxUINT64_CLASS: 'uint64',
mxSINGLE_CLASS: 'single',
mxDOUBLE_CLASS: 'double',
mxCELL_CLASS: 'cell',
mxSTRUCT_CLASS: 'struct',
mxOBJECT_CLASS: 'object',
mxCHAR_CLASS: 'char',
mxSPARSE_CLASS: 'sparse',
mxFUNCTION_CLASS: 'function',
mxOPAQUE_CLASS: 'opaque',
}
NP_TO_MTYPES = {
'f8': miDOUBLE,
'c32': miDOUBLE,
'c24': miDOUBLE,
'c16': miDOUBLE,
'f4': miSINGLE,
'c8': miSINGLE,
'i8': miINT64,
'i4': miINT32,
'i2': miINT16,
'i1': miINT8,
'u8': miUINT64,
'u4': miUINT32,
'u2': miUINT16,
'u1': miUINT8,
'S1': miUINT8,
'U1': miUTF16,
'b1': miUINT8, # not standard but seems MATLAB uses this (gh-4022)
}
NP_TO_MXTYPES = {
'f8': mxDOUBLE_CLASS,
'c32': mxDOUBLE_CLASS,
'c24': mxDOUBLE_CLASS,
'c16': mxDOUBLE_CLASS,
'f4': mxSINGLE_CLASS,
'c8': mxSINGLE_CLASS,
'i8': mxINT64_CLASS,
'i4': mxINT32_CLASS,
'i2': mxINT16_CLASS,
'i1': mxINT8_CLASS,
'u8': mxUINT64_CLASS,
'u4': mxUINT32_CLASS,
'u2': mxUINT16_CLASS,
'u1': mxUINT8_CLASS,
'S1': mxUINT8_CLASS,
'b1': mxUINT8_CLASS, # not standard but seems MATLAB uses this
}
''' Before release v7.1 (release 14) matlab (TM) used the system
default character encoding scheme padded out to 16-bits. Release 14
and later use Unicode. When saving character data, R14 checks if it
can be encoded in 7-bit ascii, and saves in that format if so.'''
codecs_template = {
miUTF8: {'codec': 'utf_8', 'width': 1},
miUTF16: {'codec': 'utf_16', 'width': 2},
miUTF32: {'codec': 'utf_32','width': 4},
}
def _convert_codecs(template, byte_order):
''' Convert codec template mapping to byte order
Set codecs not on this system to None
Parameters
----------
template : mapping
key, value are respectively mdtype code, and mapping giving the
codec root name (without byte order suffix) and character width
byte_order : {'<', '>'}
code for little or big endian
Returns
-------
codecs : dict
key, value are name, codec (as in .encode(codec))
'''
codecs = {}
postfix = '_le' if byte_order == '<' else '_be'
for k, v in template.items():
codec = v['codec']
try:
" ".encode(codec)
except LookupError:
codecs[k] = None
continue
if v['width'] > 1:
codec += postfix
codecs[k] = codec
return codecs
MDTYPES = {}
for _bytecode in '<>':
_def = {'dtypes': convert_dtypes(mdtypes_template, _bytecode),
'classes': convert_dtypes(mclass_dtypes_template, _bytecode),
'codecs': _convert_codecs(codecs_template, _bytecode)}
MDTYPES[_bytecode] = _def
class mat_struct:
"""Placeholder for holding read data from structs.
We use instances of this class when the user passes False as a value to the
``struct_as_record`` parameter of the :func:`scipy.io.loadmat` function.
"""
pass
class MatlabObject(np.ndarray):
"""Subclass of ndarray to signal this is a matlab object.
This is a simple subclass of :class:`numpy.ndarray` meant to be used
by :func:`scipy.io.loadmat` and should not be instantiated directly.
"""
def __new__(cls, input_array, classname=None):
# Input array is an already formed ndarray instance
# We first cast to be our class type
obj = np.asarray(input_array).view(cls)
# add the new attribute to the created instance
obj.classname = classname
# Finally, we must return the newly created object:
return obj
def __array_finalize__(self,obj):
# reset the attribute from passed original object
self.classname = getattr(obj, 'classname', None)
# We do not need to return anything
class MatlabFunction(np.ndarray):
"""Subclass for a MATLAB function.
This is a simple subclass of :class:`numpy.ndarray` meant to be used
by :func:`scipy.io.loadmat` and should not be directly instantiated.
"""
def __new__(cls, input_array):
obj = np.asarray(input_array).view(cls)
return obj
class MatlabOpaque(np.ndarray):
"""Subclass for a MATLAB opaque matrix.
This is a simple subclass of :class:`numpy.ndarray` meant to be used
by :func:`scipy.io.loadmat` and should not be directly instantiated.
"""
def __new__(cls, input_array):
obj = np.asarray(input_array).view(cls)
return obj
OPAQUE_DTYPE = np.dtype(
[('s0', 'O'), ('s1', 'O'), ('s2', 'O'), ('arr', 'O')])
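# Hedged sketch of how the lookup tables above chain together on write:
# a NumPy dtype string maps to an mx class code, which mclass_info names.
# (These are private constants and may change between releases.)
def _demo_type_lookup():
    dt = np.dtype('f8')
    mx = NP_TO_MXTYPES[dt.str[1:]]   # 'f8' -> mxDOUBLE_CLASS
    return mclass_info[mx]           # 'double'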

View File

@ -0,0 +1,425 @@
# Authors: Travis Oliphant, Matthew Brett
"""
Base classes for MATLAB file stream reading.
MATLAB is a registered trademark of the Mathworks inc.
"""
import numpy as np
from scipy._lib import doccer
from . import _byteordercodes as boc
__all__ = [
'MatReadError', 'MatReadWarning', 'MatWriteError',
]
class MatReadError(Exception):
"""Exception indicating a read issue."""
class MatWriteError(Exception):
"""Exception indicating a write issue."""
class MatReadWarning(UserWarning):
"""Warning class for read issues."""
doc_dict = \
{'file_arg':
'''file_name : str
Name of the mat file (do not need .mat extension if
appendmat==True). Can also pass open file-like object.''',
'append_arg':
'''appendmat : bool, optional
True to append the .mat extension to the end of the given
filename, if not already present. Default is True.''',
'load_args':
'''byte_order : str or None, optional
None by default, implying byte order guessed from mat
file. Otherwise can be one of ('native', '=', 'little', '<',
'BIG', '>').
mat_dtype : bool, optional
If True, return arrays in same dtype as would be loaded into
MATLAB (instead of the dtype with which they are saved).
squeeze_me : bool, optional
Whether to squeeze unit matrix dimensions or not.
chars_as_strings : bool, optional
Whether to convert char arrays to string arrays.
matlab_compatible : bool, optional
Returns matrices as would be loaded by MATLAB (implies
squeeze_me=False, chars_as_strings=False, mat_dtype=True,
struct_as_record=True).''',
'struct_arg':
'''struct_as_record : bool, optional
Whether to load MATLAB structs as NumPy record arrays, or as
old-style NumPy arrays with dtype=object. Setting this flag to
False replicates the behavior of SciPy version 0.7.x (returning
numpy object arrays). The default setting is True, because it
allows easier round-trip load and save of MATLAB files.''',
'matstream_arg':
'''mat_stream : file-like
Object with file API, open for reading.''',
'long_fields':
'''long_field_names : bool, optional
* False - maximum field name length in a structure is 31 characters
which is the documented maximum length. This is the default.
* True - maximum field name length in a structure is 63 characters
which works for MATLAB 7.6''',
'do_compression':
'''do_compression : bool, optional
Whether to compress matrices on write. Default is False.''',
'oned_as':
'''oned_as : {'row', 'column'}, optional
If 'column', write 1-D NumPy arrays as column vectors.
If 'row', write 1-D NumPy arrays as row vectors.''',
'unicode_strings':
'''unicode_strings : bool, optional
If True, write strings as Unicode, else use MATLAB's usual encoding.'''}
docfiller = doccer.filldoc(doc_dict)
'''
Note on architecture
======================
There are three sets of parameters relevant for reading files. The
first are *file read parameters* - containing options that are common
for reading the whole file, and therefore every variable within that
file. At the moment these are:
* mat_stream
* dtypes (derived from byte code)
* byte_order
* chars_as_strings
* squeeze_me
* struct_as_record (MATLAB 5 files)
* class_dtypes (derived from order code, MATLAB 5 files)
* codecs (MATLAB 5 files)
* uint16_codec (MATLAB 5 files)
Another set of parameters are those that apply only to the current
variable being read - the *header*:
* header related variables (different for v4 and v5 mat files)
* is_complex
* mclass
* var_stream
With the header, we need ``next_position`` to tell us where the next
variable in the stream is.
Then, for each element in a matrix, there can be *element read
parameters*. An element is, for example, one element in a MATLAB cell
array. At the moment, these are:
* mat_dtype
The file-reading object contains the *file read parameters*. The
*header* is passed around as a data object, or may be read and discarded
in a single function. The *element read parameters* - the mat_dtype in
this instance - are passed into a general post-processing function - see
``mio_utils`` for details.
'''
def convert_dtypes(dtype_template, order_code):
''' Convert dtypes in mapping to given order
Parameters
----------
dtype_template : mapping
mapping with values returning numpy dtype from ``np.dtype(val)``
order_code : str
an order code suitable for using in ``dtype.newbyteorder()``
Returns
-------
dtypes : mapping
mapping where values have been replaced by
``np.dtype(val).newbyteorder(order_code)``
'''
dtypes = dtype_template.copy()
for k in dtypes:
dtypes[k] = np.dtype(dtypes[k]).newbyteorder(order_code)
return dtypes
def read_dtype(mat_stream, a_dtype):
"""
Generic get of byte stream data of known type
Parameters
----------
mat_stream : file_like object
MATLAB (tm) mat file stream
a_dtype : dtype
dtype of array to read. `a_dtype` is assumed to be correct
endianness.
Returns
-------
arr : ndarray
Array of dtype `a_dtype` read from stream.
"""
num_bytes = a_dtype.itemsize
arr = np.ndarray(shape=(),
dtype=a_dtype,
buffer=mat_stream.read(num_bytes),
order='F')
return arr
def matfile_version(file_name, *, appendmat=True):
"""
Return major, minor tuple depending on apparent mat file type
Where:
#. 0,x -> version 4 format mat files
#. 1,x -> version 5 format mat files
#. 2,x -> version 7.3 format mat files (HDF format)
Parameters
----------
file_name : str
Name of the mat file (do not need .mat extension if
appendmat==True). Can also pass open file-like object.
appendmat : bool, optional
True to append the .mat extension to the end of the given
filename, if not already present. Default is True.
Returns
-------
major_version : {0, 1, 2}
major MATLAB File format version
minor_version : int
minor MATLAB file format version
Raises
------
MatReadError
If the file is empty.
ValueError
The matfile version is unknown.
Notes
-----
Has the side effect of setting the file read pointer to 0
"""
from ._mio import _open_file_context
with _open_file_context(file_name, appendmat=appendmat) as fileobj:
return _get_matfile_version(fileobj)
get_matfile_version = matfile_version
def _get_matfile_version(fileobj):
# Mat4 files have a zero somewhere in first 4 bytes
fileobj.seek(0)
mopt_bytes = fileobj.read(4)
if len(mopt_bytes) == 0:
raise MatReadError("Mat file appears to be empty")
mopt_ints = np.ndarray(shape=(4,), dtype=np.uint8, buffer=mopt_bytes)
if 0 in mopt_ints:
fileobj.seek(0)
return (0,0)
# For 5 format or 7.3 format we need to read an integer in the
# header. Bytes 124 through 128 contain a version integer and an
# endian test string
fileobj.seek(124)
tst_str = fileobj.read(4)
fileobj.seek(0)
maj_ind = int(tst_str[2] == b'I'[0])
maj_val = int(tst_str[maj_ind])
min_val = int(tst_str[1 - maj_ind])
ret = (maj_val, min_val)
if maj_val in (1, 2):
return ret
raise ValueError('Unknown mat file type, version {}, {}'.format(*ret))
def matdims(arr, oned_as='column'):
"""
Determine equivalent MATLAB dimensions for given array
Parameters
----------
arr : ndarray
Input array
oned_as : {'column', 'row'}, optional
Whether 1-D arrays are returned as MATLAB row or column matrices.
Default is 'column'.
Returns
-------
dims : tuple
Shape tuple, in the form MATLAB expects it.
Notes
-----
We had to decide what shape a 1 dimensional array would be by
default. ``np.atleast_2d`` thinks it is a row vector. The
default for a vector in MATLAB (e.g., ``>> 1:12``) is a row vector.
Versions of scipy up to and including 0.11 resulted (accidentally)
in 1-D arrays being read as column vectors. For the moment, we
maintain the same tradition here.
Examples
--------
>>> import numpy as np
>>> from scipy.io.matlab._miobase import matdims
>>> matdims(np.array(1)) # NumPy scalar
(1, 1)
>>> matdims(np.array([1])) # 1-D array, 1 element
(1, 1)
>>> matdims(np.array([1,2])) # 1-D array, 2 elements
(2, 1)
>>> matdims(np.array([[2],[3]])) # 2-D array, column vector
(2, 1)
>>> matdims(np.array([[2,3]])) # 2-D array, row vector
(1, 2)
>>> matdims(np.array([[[2,3]]])) # 3-D array, rowish vector
(1, 1, 2)
>>> matdims(np.array([])) # empty 1-D array
(0, 0)
>>> matdims(np.array([[]])) # empty 2-D array
(0, 0)
>>> matdims(np.array([[[]]])) # empty 3-D array
(0, 0, 0)
Optional argument flips 1-D shape behavior.
>>> matdims(np.array([1,2]), 'row') # 1-D array, 2 elements
(1, 2)
The argument has to make sense though
>>> matdims(np.array([1,2]), 'bizarre')
Traceback (most recent call last):
...
ValueError: 1-D option "bizarre" is strange
"""
shape = arr.shape
if shape == (): # scalar
return (1, 1)
if len(shape) == 1: # 1D
if shape[0] == 0:
return (0, 0)
elif oned_as == 'column':
return shape + (1,)
elif oned_as == 'row':
return (1,) + shape
else:
raise ValueError('1-D option "%s" is strange'
% oned_as)
return shape
class MatVarReader:
''' Abstract class defining required interface for var readers'''
def __init__(self, file_reader):
pass
def read_header(self):
''' Returns header '''
pass
def array_from_header(self, header):
''' Reads array given header '''
pass
class MatFileReader:
""" Base object for reading mat files
To make this class functional, you will need to override the
following methods:
matrix_getter_factory - gives object to fetch next matrix from stream
guess_byte_order - guesses file byte order from file
"""
@docfiller
def __init__(self, mat_stream,
byte_order=None,
mat_dtype=False,
squeeze_me=False,
chars_as_strings=True,
matlab_compatible=False,
struct_as_record=True,
verify_compressed_data_integrity=True,
simplify_cells=False):
'''
Initializer for mat file reader
mat_stream : file-like
object with file API, open for reading
%(load_args)s
'''
# Initialize stream
self.mat_stream = mat_stream
self.dtypes = {}
if not byte_order:
byte_order = self.guess_byte_order()
else:
byte_order = boc.to_numpy_code(byte_order)
self.byte_order = byte_order
self.struct_as_record = struct_as_record
if matlab_compatible:
self.set_matlab_compatible()
else:
self.squeeze_me = squeeze_me
self.chars_as_strings = chars_as_strings
self.mat_dtype = mat_dtype
self.verify_compressed_data_integrity = verify_compressed_data_integrity
self.simplify_cells = simplify_cells
if simplify_cells:
self.squeeze_me = True
self.struct_as_record = False
def set_matlab_compatible(self):
''' Sets options to return arrays as MATLAB loads them '''
self.mat_dtype = True
self.squeeze_me = False
self.chars_as_strings = False
def guess_byte_order(self):
''' As we do not know what file type we have, assume native '''
return boc.native_code
def end_of_stream(self):
b = self.mat_stream.read(1)
curpos = self.mat_stream.tell()
self.mat_stream.seek(curpos-1)
return len(b) == 0
def arr_dtype_number(arr, num):
''' Return dtype for given number of items per element'''
return np.dtype(arr.dtype.str[:2] + str(num))
def arr_to_chars(arr):
''' Convert string array to char array '''
dims = list(arr.shape)
if not dims:
dims = [1]
dims.append(int(arr.dtype.str[2:]))
arr = np.ndarray(shape=dims,
dtype=arr_dtype_number(arr, 1),
buffer=arr)
empties = [arr == np.array('', dtype=arr.dtype)]
if not np.any(empties):
return arr
arr = arr.copy()
arr[tuple(empties)] = ' '
return arr
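# Hedged sketch (hypothetical helper) for matfile_version defined above:
# version sniffing works on open file-like objects as well as file names.
def _demo_matfile_version():
    import io
    from scipy.io import savemat
    buf = io.BytesIO()
    savemat(buf, {'x': np.ones(3)})
    return matfile_version(buf)   # (1, 0): a version 5 MAT-file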

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="byteordercodes",
private_modules=["_byteordercodes"], all=__all__,
attribute=name)

View File

@ -0,0 +1,16 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = ["loadmat", "savemat", "whosmat"] # noqa: F822
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="mio",
private_modules=["_mio"], all=__all__,
attribute=name)
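# Hedged sketch of the shim in action: importing through the deprecated
# scipy.io.matlab.mio path still resolves, but emits a DeprecationWarning
# via _sub_module_deprecation.
def _demo_deprecated_import():
    import warnings
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        from scipy.io.matlab.mio import loadmat  # noqa: F401
    return any(issubclass(w.category, DeprecationWarning) for w in caught)   # True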

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="mio4",
private_modules=["_mio4"], all=__all__,
attribute=name)

View File

@ -0,0 +1,19 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'MatWriteError', 'MatReadError', 'MatReadWarning', 'MatlabObject',
'MatlabFunction', 'mat_struct', 'varmats_from_mat',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="mio5",
private_modules=["_mio5"], all=__all__,
attribute=name)

View File

@ -0,0 +1,18 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'MatlabFunction', 'MatlabObject', 'MatlabOpaque', 'mat_struct',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="mio5_params",
private_modules=["_mio5_params"], all=__all__,
attribute=name)

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="mio5_utils",
private_modules=["_mio5_utils"], all=__all__,
attribute=name)

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="mio_utils",
private_modules=["_mio_utils"], all=__all__,
attribute=name)

View File

@ -0,0 +1,16 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = ["MatReadError", "MatReadWarning", "MatWriteError"] # noqa: F822
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="miobase",
private_modules=["_miobase"], all=__all__,
attribute=name)

View File

@ -0,0 +1,16 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.io.matlab` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="io.matlab", module="streams",
private_modules=["_streams"], all=__all__,
attribute=name)

View File

@ -0,0 +1,5 @@
Japanese:
すべての人間は、生まれながらにして自由であり、
かつ、尊厳と権利とについて平等である。
人間は、理性と良心とを授けられており、
互いに同胞の精神をもって行動しなければならない。

Some files were not shown because too many files have changed in this diff.