2024-11-29 18:15:30 +00:00
parent 40aade2d8e
commit bc9415586e
5298 changed files with 1938676 additions and 80 deletions


@ -0,0 +1,649 @@
"""
.. _statsrefmanual:
==========================================
Statistical functions (:mod:`scipy.stats`)
==========================================
.. currentmodule:: scipy.stats
This module contains a large number of probability distributions,
summary and frequency statistics, correlation functions and statistical
tests, masked statistics, kernel density estimation, quasi-Monte Carlo
functionality, and more.
Statistics is a very large area, and there are topics that are out of scope
for SciPy and are covered by other packages. Some of the most important ones
are:
- `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
regression, linear models, time series analysis, extensions to topics
also covered by ``scipy.stats``.
- `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
functionality, interfaces to other statistical languages.
- `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
modeling, probabilistic machine learning.
- `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
model selection.
- `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
- `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
Probability distributions
=========================
Each univariate distribution is an instance of a subclass of `rv_continuous`
(`rv_discrete` for discrete distributions):
.. autosummary::
:toctree: generated/
rv_continuous
rv_discrete
rv_histogram
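These classes share a common interface with methods such as ``pdf``, ``cdf``,
and ``rvs``, and a distribution can be "frozen" with fixed shape, location,
and scale parameters. A minimal illustrative sketch (the parameter values are
arbitrary):

>>> from scipy import stats
>>> dist = stats.norm(loc=2.0, scale=1.5)      # a frozen normal distribution
>>> x = dist.rvs(size=5, random_state=1234)    # draw random variates
>>> p = dist.cdf(x)                            # evaluate the CDF at those points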
Continuous distributions
------------------------
.. autosummary::
:toctree: generated/
alpha -- Alpha
anglit -- Anglit
arcsine -- Arcsine
argus -- Argus
beta -- Beta
betaprime -- Beta Prime
bradford -- Bradford
burr -- Burr (Type III)
burr12 -- Burr (Type XII)
cauchy -- Cauchy
chi -- Chi
chi2 -- Chi-squared
cosine -- Cosine
crystalball -- Crystalball
dgamma -- Double Gamma
dweibull -- Double Weibull
erlang -- Erlang
expon -- Exponential
exponnorm -- Exponentially Modified Normal
exponweib -- Exponentiated Weibull
exponpow -- Exponential Power
f                 -- F (Snedecor F)
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
fisk -- Fisk
foldcauchy -- Folded Cauchy
foldnorm -- Folded Normal
genlogistic -- Generalized Logistic
gennorm -- Generalized normal
genpareto -- Generalized Pareto
genexpon -- Generalized Exponential
genextreme -- Generalized Extreme Value
gausshyper -- Gauss Hypergeometric
gamma -- Gamma
gengamma -- Generalized gamma
genhalflogistic -- Generalized Half Logistic
genhyperbolic -- Generalized Hyperbolic
geninvgauss -- Generalized Inverse Gaussian
gibrat -- Gibrat
gompertz -- Gompertz (Truncated Gumbel)
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
gumbel_l -- Left Sided Gumbel, etc.
halfcauchy -- Half Cauchy
halflogistic -- Half Logistic
halfnorm -- Half Normal
halfgennorm -- Generalized Half Normal
hypsecant -- Hyperbolic Secant
invgamma -- Inverse Gamma
invgauss -- Inverse Gaussian
invweibull -- Inverse Weibull
irwinhall -- Irwin-Hall
jf_skew_t -- Jones and Faddy Skew-T
johnsonsb -- Johnson SB
johnsonsu -- Johnson SU
kappa4 -- Kappa 4 parameter
kappa3 -- Kappa 3 parameter
ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
laplace -- Laplace
laplace_asymmetric -- Asymmetric Laplace
levy -- Levy
levy_l            -- Left-skewed Levy
levy_stable       -- Levy Stable
logistic -- Logistic
loggamma -- Log-Gamma
loglaplace -- Log-Laplace (Log Double Exponential)
lognorm -- Log-Normal
loguniform -- Log-Uniform
lomax -- Lomax (Pareto of the second kind)
maxwell -- Maxwell
mielke -- Mielke's Beta-Kappa
moyal -- Moyal
nakagami -- Nakagami
ncx2 -- Non-central chi-squared
ncf -- Non-central F
nct -- Non-central Student's T
norm -- Normal (Gaussian)
norminvgauss -- Normal Inverse Gaussian
pareto -- Pareto
pearson3 -- Pearson type III
powerlaw -- Power-function
powerlognorm -- Power log normal
powernorm -- Power normal
rdist -- R-distribution
rayleigh -- Rayleigh
rel_breitwigner -- Relativistic Breit-Wigner
rice -- Rice
recipinvgauss -- Reciprocal Inverse Gaussian
semicircular -- Semicircular
skewcauchy -- Skew Cauchy
skewnorm -- Skew normal
studentized_range -- Studentized Range
t -- Student's T
trapezoid -- Trapezoidal
triang -- Triangular
truncexpon -- Truncated Exponential
truncnorm -- Truncated Normal
truncpareto -- Truncated Pareto
truncweibull_min -- Truncated minimum Weibull distribution
tukeylambda -- Tukey-Lambda
uniform -- Uniform
vonmises -- Von-Mises (Circular)
vonmises_line -- Von-Mises (Line)
wald -- Wald
weibull_min -- Minimum Weibull (see Frechet)
weibull_max -- Maximum Weibull (see Frechet)
wrapcauchy -- Wrapped Cauchy
The ``fit`` method of the univariate continuous distributions uses
maximum likelihood estimation to fit the distribution to a data set.
The ``fit`` method can accept regular data or *censored data*.
Censored data is represented with instances of the `CensoredData`
class.
.. autosummary::
:toctree: generated/
CensoredData
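A minimal illustrative sketch of fitting by maximum likelihood, with and
without censoring (the data below are synthetic):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(12345)
>>> data = rng.normal(loc=5.0, scale=2.0, size=100)
>>> loc, scale = stats.norm.fit(data)                     # plain MLE fit
>>> censored = stats.CensoredData(uncensored=data[:80], right=data[80:])
>>> loc_c, scale_c = stats.norm.fit(censored)             # fit to censored data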
Multivariate distributions
--------------------------
.. autosummary::
:toctree: generated/
multivariate_normal -- Multivariate normal distribution
matrix_normal -- Matrix normal distribution
dirichlet -- Dirichlet
dirichlet_multinomial -- Dirichlet multinomial distribution
wishart -- Wishart
invwishart -- Inverse Wishart
multinomial -- Multinomial distribution
special_ortho_group -- SO(N) group
ortho_group -- O(N) group
unitary_group -- U(N) group
random_correlation -- random correlation matrices
multivariate_t -- Multivariate t-distribution
multivariate_hypergeom -- Multivariate hypergeometric distribution
random_table -- Distribution of random tables with given marginals
uniform_direction -- Uniform distribution on S(N-1)
vonmises_fisher -- Von Mises-Fisher distribution
`scipy.stats.multivariate_normal` methods accept instances
of the following class to represent the covariance.
.. autosummary::
:toctree: generated/
Covariance -- Representation of a covariance matrix
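For example, a diagonal covariance can be represented with
`Covariance.from_diagonal` and passed directly to the ``pdf`` method
(illustrative values):

>>> from scipy import stats
>>> cov = stats.Covariance.from_diagonal([1.0, 4.0])
>>> p = stats.multivariate_normal.pdf([0.0, 0.0], mean=[0.0, 0.0], cov=cov)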
Discrete distributions
----------------------
.. autosummary::
:toctree: generated/
bernoulli -- Bernoulli
betabinom -- Beta-Binomial
betanbinom -- Beta-Negative Binomial
binom -- Binomial
boltzmann -- Boltzmann (Truncated Discrete Exponential)
dlaplace -- Discrete Laplacian
geom -- Geometric
hypergeom -- Hypergeometric
logser -- Logarithmic (Log-Series, Series)
nbinom -- Negative Binomial
nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
nhypergeom -- Negative Hypergeometric
planck -- Planck (Discrete Exponential)
poisson -- Poisson
randint -- Discrete Uniform
skellam -- Skellam
yulesimon -- Yule-Simon
zipf -- Zipf (Zeta)
zipfian -- Zipfian
An overview of statistical functions is given below. Many of these functions
have a similar version in `scipy.stats.mstats` which works with masked arrays.
Summary statistics
==================
.. autosummary::
:toctree: generated/
describe -- Descriptive statistics
gmean -- Geometric mean
hmean -- Harmonic mean
pmean -- Power mean
kurtosis -- Fisher or Pearson kurtosis
mode -- Modal value
moment -- Central moment
expectile -- Expectile
skew -- Skewness
kstat              -- k-statistic
kstatvar           -- Variance of the k-statistic
tmean              -- Truncated arithmetic mean
tvar               -- Truncated variance
tmin               -- Truncated minimum
tmax               -- Truncated maximum
tstd               -- Truncated standard deviation
tsem               -- Truncated standard error of the mean
variation -- Coefficient of variation
find_repeats       -- Find repeated values and their counts
rankdata           -- Assign ranks to data
tiecorrect         -- Tie correction factor for ranks
trim_mean          -- Trimmed mean
gstd -- Geometric Standard Deviation
iqr                    -- Interquartile range
sem                    -- Standard error of the mean
bayes_mvs              -- Bayesian confidence intervals for mean, variance, and standard deviation
mvsdist                -- Frozen distributions for mean, variance, and standard deviation
entropy                -- Entropy and relative entropy (Kullback-Leibler divergence)
differential_entropy   -- Differential entropy of a sample
median_abs_deviation   -- Median absolute deviation
Frequency statistics
====================
.. autosummary::
:toctree: generated/
cumfreq
percentileofscore
scoreatpercentile
relfreq
.. autosummary::
:toctree: generated/
binned_statistic -- Compute a binned statistic for a set of data.
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
.. _hypotests:
Hypothesis Tests and related functions
======================================
SciPy has many functions for performing hypothesis tests that return a
test statistic and a p-value, and several of them return confidence intervals
and/or other related information.
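For example, a typical test returns an object with ``statistic`` and
``pvalue`` attributes, and many results also provide a
``confidence_interval`` method (an illustrative sketch with made-up data):

>>> from scipy import stats
>>> sample = [2.1, 2.4, 1.9, 2.6, 2.3, 2.2]
>>> res = stats.ttest_1samp(sample, popmean=2.0)
>>> statistic, pvalue = res.statistic, res.pvalue
>>> ci = res.confidence_interval(confidence_level=0.95)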
The headings below are based on common uses of the functions within, but due to
the wide variety of statistical procedures, any attempt at coarse-grained
categorization will be imperfect. Also, note that tests within the same heading
are not interchangeable in general (e.g. many have different distributional
assumptions).
One Sample Tests / Paired Sample Tests
--------------------------------------
One sample tests are typically used to assess whether a single sample was
drawn from a specified distribution or a distribution with specified properties
(e.g. zero mean).
.. autosummary::
:toctree: generated/
ttest_1samp
binomtest
quantile_test
skewtest
kurtosistest
normaltest
jarque_bera
shapiro
anderson
cramervonmises
ks_1samp
goodness_of_fit
chisquare
power_divergence
Paired sample tests are often used to assess whether two samples were drawn
from the same distribution; they differ from the independent sample tests below
in that each observation in one sample is treated as paired with a
closely-related observation in the other sample (e.g. when environmental
factors are controlled between observations within a pair but not among pairs).
They can also be interpreted or used as one-sample tests (e.g. tests on the
mean or median of *differences* between paired observations).
.. autosummary::
:toctree: generated/
ttest_rel
wilcoxon
Association/Correlation Tests
-----------------------------
These tests are often used to assess whether there is a relationship (e.g.
linear) between paired observations in multiple samples or among the
coordinates of multivariate observations.
.. autosummary::
:toctree: generated/
linregress
pearsonr
spearmanr
pointbiserialr
kendalltau
weightedtau
somersd
siegelslopes
theilslopes
page_trend_test
multiscale_graphcorr
These association tests are for working with samples in the form of contingency
tables. Supporting functions are available in `scipy.stats.contingency`.
.. autosummary::
:toctree: generated/
chi2_contingency
fisher_exact
barnard_exact
boschloo_exact
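A minimal illustrative sketch with a made-up 2x2 table of observed counts:

>>> import numpy as np
>>> from scipy import stats
>>> table = np.array([[10, 20],
...                   [15, 5]])
>>> res = stats.chi2_contingency(table)
>>> chi2, p = res.statistic, res.pvalue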
Independent Sample Tests
------------------------
Independent sample tests are typically used to assess whether multiple samples
were independently drawn from the same distribution or different distributions
with a shared property (e.g. equal means).
Some tests are specifically for comparing two samples.
.. autosummary::
:toctree: generated/
ttest_ind_from_stats
poisson_means_test
ttest_ind
mannwhitneyu
bws_test
ranksums
brunnermunzel
mood
ansari
cramervonmises_2samp
epps_singleton_2samp
ks_2samp
kstest
Others are generalized to multiple samples.
.. autosummary::
:toctree: generated/
f_oneway
tukey_hsd
dunnett
kruskal
alexandergovern
fligner
levene
bartlett
median_test
friedmanchisquare
anderson_ksamp
Resampling and Monte Carlo Methods
----------------------------------
The following functions can reproduce the p-value and confidence interval
results of most of the functions above, and often produce accurate results in a
wider variety of conditions. They can also be used to perform hypothesis tests
and generate confidence intervals for custom statistics. This flexibility comes
at the cost of greater computational requirements and stochastic results.
.. autosummary::
:toctree: generated/
monte_carlo_test
permutation_test
bootstrap
power
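For example, `bootstrap` can generate a confidence interval for a statistic
of a single sample (an illustrative sketch; the data are synthetic):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(2468)
>>> sample = rng.exponential(scale=3.0, size=50)
>>> res = stats.bootstrap((sample,), np.mean, confidence_level=0.95)
>>> ci = res.confidence_interval          # low and high endpoints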
Instances of the following classes can be passed into some hypothesis test
functions to perform a resampling or Monte Carlo version of the hypothesis
test.
.. autosummary::
:toctree: generated/
MonteCarloMethod
PermutationMethod
BootstrapMethod
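For example, `pearsonr` accepts such an instance through its ``method``
argument to compute a permutation-based p-value (illustrative data):

>>> from scipy import stats
>>> x = [1.0, 2.0, 3.0, 4.0, 5.0]
>>> y = [1.2, 1.9, 3.3, 4.1, 4.8]
>>> res = stats.pearsonr(x, y, method=stats.PermutationMethod())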
Multiple Hypothesis Testing and Meta-Analysis
---------------------------------------------
These functions are for assessing the results of individual tests as a whole.
Functions for performing specific multiple hypothesis tests (e.g. post hoc
tests) are listed above.
.. autosummary::
:toctree: generated/
combine_pvalues
false_discovery_control
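For example, `false_discovery_control` adjusts p-values to control the false
discovery rate (illustrative p-values; the default procedure is
Benjamini-Hochberg):

>>> from scipy import stats
>>> pvalues = [0.005, 0.009, 0.02, 0.04, 0.30]
>>> adjusted = stats.false_discovery_control(pvalues)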
The following functions are related to the tests above but do not belong in the
above categories.
Quasi-Monte Carlo
=================
.. toctree::
:maxdepth: 4
stats.qmc
Contingency Tables
==================
.. toctree::
:maxdepth: 4
stats.contingency
Masked statistics functions
===========================
.. toctree::
stats.mstats
Other statistical functionality
===============================
Transformations
---------------
.. autosummary::
:toctree: generated/
boxcox
boxcox_normmax
boxcox_llf
yeojohnson
yeojohnson_normmax
yeojohnson_llf
obrientransform
sigmaclip
trimboth
trim1
zmap
zscore
gzscore
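A minimal illustrative sketch (the data are arbitrary positive values, as
required by `boxcox`):

>>> import numpy as np
>>> from scipy import stats
>>> x = np.array([1.0, 2.0, 2.5, 4.0, 10.0])
>>> z = stats.zscore(x)              # standardize to zero mean, unit variance
>>> y, lmbda = stats.boxcox(x)       # Box-Cox transform and fitted lambda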
Statistical distances
---------------------
.. autosummary::
:toctree: generated/
wasserstein_distance
wasserstein_distance_nd
energy_distance
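For example, the 1-D Wasserstein distance between two small empirical
samples (illustrative values):

>>> from scipy import stats
>>> d = stats.wasserstein_distance([0.0, 1.0, 3.0], [5.0, 6.0, 8.0])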
Sampling
--------
.. toctree::
:maxdepth: 4
stats.sampling
Random variate generation / CDF Inversion
-----------------------------------------
.. autosummary::
:toctree: generated/
rvs_ratio_uniforms
Fitting / Survival Analysis
---------------------------
.. autosummary::
:toctree: generated/
fit
ecdf
logrank
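A minimal sketch of the empirical CDF returned by `ecdf`, assuming the
evaluation interface described in its documentation (illustrative data):

>>> from scipy import stats
>>> sample = [1.0, 2.0, 2.0, 4.0, 7.0]
>>> res = stats.ecdf(sample)
>>> F = res.cdf.evaluate([0.5, 2.5, 5.0])   # empirical CDF at new points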
Directional statistical functions
---------------------------------
.. autosummary::
:toctree: generated/
directional_stats
circmean
circvar
circstd
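For example, angles that wrap around the circle near zero (in radians, with
the default period of ``2*pi``; illustrative values):

>>> import numpy as np
>>> from scipy import stats
>>> angles = np.array([0.1, 0.2, 6.1, 6.2])
>>> m = stats.circmean(angles)       # close to 0 rather than to pi
>>> v = stats.circvar(angles)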
Sensitivity Analysis
--------------------
.. autosummary::
:toctree: generated/
sobol_indices
Plot-tests
----------
.. autosummary::
:toctree: generated/
ppcc_max
ppcc_plot
probplot
boxcox_normplot
yeojohnson_normplot
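For example, `probplot` computes the quantile pairs and least-squares fit for
a probability plot even when no plotting object is supplied (synthetic data):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(7)
>>> x = rng.normal(size=100)
>>> (osm, osr), (slope, intercept, r) = stats.probplot(x, dist="norm")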
Univariate and multivariate kernel density estimation
-----------------------------------------------------
.. autosummary::
:toctree: generated/
gaussian_kde
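A minimal illustrative sketch (synthetic one-dimensional data):

>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(99)
>>> dataset = rng.normal(size=200)
>>> kde = stats.gaussian_kde(dataset)
>>> density = kde.evaluate(np.linspace(-3, 3, 7))   # estimated pdf values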
Warnings / Errors used in :mod:`scipy.stats`
--------------------------------------------
.. autosummary::
:toctree: generated/
DegenerateDataWarning
ConstantInputWarning
NearConstantInputWarning
FitError
Result classes used in :mod:`scipy.stats`
-----------------------------------------
.. warning::
These classes are private, but they are included here because instances
of them are returned by other statistical functions. User import and
instantiation is not supported.
.. toctree::
:maxdepth: 2
stats._result_classes
""" # noqa: E501
from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
DegenerateDataWarning, FitError)
from ._stats_py import *
from ._variation import variation
from .distributions import *
from ._morestats import *
from ._multicomp import *
from ._binomtest import binomtest
from ._binned_statistic import *
from ._kde import gaussian_kde
from . import mstats
from . import qmc
from ._multivariate import *
from . import contingency
from .contingency import chi2_contingency
from ._censored_data import CensoredData
from ._resampling import (bootstrap, monte_carlo_test, permutation_test, power,
MonteCarloMethod, PermutationMethod, BootstrapMethod)
from ._entropy import *
from ._hypotests import *
from ._rvs_sampling import rvs_ratio_uniforms
from ._page_trend_test import page_trend_test
from ._mannwhitneyu import mannwhitneyu
from ._bws_test import bws_test
from ._fit import fit, goodness_of_fit
from ._covariance import Covariance
from ._sensitivity_analysis import *
from ._survival import *
from ._mgc import multiscale_graphcorr
# Deprecated namespaces, to be removed in v2.0.0
from . import (
biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, stats
)
__all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester


@ -0,0 +1,686 @@
# Many scipy.stats functions support `axis` and `nan_policy` parameters.
# When the two are combined, it can be tricky to get all the behavior just
# right. This file contains utility functions useful for scipy.stats functions
# that support `axis` and `nan_policy`, including a decorator that
# automatically adds `axis` and `nan_policy` arguments to a function.
import warnings
import numpy as np
from functools import wraps
from scipy._lib._docscrape import FunctionDoc, Parameter
from scipy._lib._util import _contains_nan, AxisError, _get_nan
from scipy._lib._array_api import array_namespace, is_numpy
import inspect
too_small_1d_not_omit = (
"One or more sample arguments is too small; all "
"returned values will be NaN. "
"See documentation for sample size requirements.")
too_small_1d_omit = (
"After omitting NaNs, one or more sample arguments "
"is too small; all returned values will be NaN. "
"See documentation for sample size requirements.")
too_small_nd_not_omit = (
"All axis-slices of one or more sample arguments are "
"too small; all elements of returned arrays will be NaN. "
"See documentation for sample size requirements.")
too_small_nd_omit = (
"After omitting NaNs, one or more axis-slices of one "
"or more sample arguments is too small; corresponding "
"elements of returned arrays will be NaN. "
"See documentation for sample size requirements.")
class SmallSampleWarning(RuntimeWarning):
pass
def _broadcast_arrays(arrays, axis=None, xp=None):
"""
Broadcast shapes of arrays, ignoring incompatibility of specified axes
"""
if not arrays:
return arrays
xp = array_namespace(*arrays) if xp is None else xp
arrays = [xp.asarray(arr) for arr in arrays]
shapes = [arr.shape for arr in arrays]
new_shapes = _broadcast_shapes(shapes, axis)
if axis is None:
new_shapes = [new_shapes]*len(arrays)
return [xp.broadcast_to(array, new_shape)
for array, new_shape in zip(arrays, new_shapes)]
def _broadcast_shapes(shapes, axis=None):
"""
Broadcast shapes, ignoring incompatibility of specified axes
"""
if not shapes:
return shapes
# input validation
if axis is not None:
axis = np.atleast_1d(axis)
axis_int = axis.astype(int)
if not np.array_equal(axis_int, axis):
raise AxisError('`axis` must be an integer, a '
'tuple of integers, or `None`.')
axis = axis_int
# First, ensure all shapes have the same number of dimensions by prepending 1s.
n_dims = max([len(shape) for shape in shapes])
new_shapes = np.ones((len(shapes), n_dims), dtype=int)
for row, shape in zip(new_shapes, shapes):
row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
# Remove the shape elements of the axes to be ignored, but remember them.
if axis is not None:
axis[axis < 0] = n_dims + axis[axis < 0]
axis = np.sort(axis)
if axis[-1] >= n_dims or axis[0] < 0:
message = (f"`axis` is out of bounds "
f"for array of dimension {n_dims}")
raise AxisError(message)
if len(np.unique(axis)) != len(axis):
raise AxisError("`axis` must contain only distinct elements")
removed_shapes = new_shapes[:, axis]
new_shapes = np.delete(new_shapes, axis, axis=1)
# If arrays are broadcastable, shape elements that are 1 may be replaced
# with a corresponding non-1 shape element. Assuming arrays are
# broadcastable, that final shape element can be found with:
new_shape = np.max(new_shapes, axis=0)
# except in case of an empty array:
new_shape *= new_shapes.all(axis=0)
# Among all arrays, there can only be one unique non-1 shape element.
# Therefore, if any non-1 shape element does not match what we found
# above, the arrays must not be broadcastable after all.
if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
raise ValueError("Array shapes are incompatible for broadcasting.")
if axis is not None:
# Add back the shape elements that were ignored
new_axis = axis - np.arange(len(axis))
new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
for removed_shape in removed_shapes]
return new_shapes
else:
return tuple(new_shape)
def _broadcast_array_shapes_remove_axis(arrays, axis=None):
"""
Broadcast shapes of arrays, dropping specified axes
Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
the shape of the broadcast result after consuming/dropping `axis`.
In other words, return output shape of a typical hypothesis test on
`arrays` vectorized along `axis`.
Examples
--------
>>> import numpy as np
>>> from scipy.stats._axis_nan_policy import _broadcast_array_shapes_remove_axis
>>> a = np.zeros((5, 2, 1))
>>> b = np.zeros((9, 3))
>>> _broadcast_array_shapes_remove_axis((a, b), 1)
(5, 3)
"""
# Note that here, `axis=None` means do not consume/drop any axes - _not_
# ravel arrays before broadcasting.
shapes = [arr.shape for arr in arrays]
return _broadcast_shapes_remove_axis(shapes, axis)
def _broadcast_shapes_remove_axis(shapes, axis=None):
"""
Broadcast shapes, dropping specified axes
Same as _broadcast_array_shapes_remove_axis, but given a sequence
of array shapes `shapes` instead of the arrays themselves.
"""
shapes = _broadcast_shapes(shapes, axis)
shape = shapes[0]
if axis is not None:
shape = np.delete(shape, axis)
return tuple(shape)
def _broadcast_concatenate(arrays, axis, paired=False):
"""Concatenate arrays along an axis with broadcasting."""
arrays = _broadcast_arrays(arrays, axis if not paired else None)
res = np.concatenate(arrays, axis=axis)
return res
# TODO: add support for `axis` tuples
def _remove_nans(samples, paired):
"Remove nans from paired or unpaired 1D samples"
# potential optimization: don't copy arrays that don't contain nans
if not paired:
return [sample[~np.isnan(sample)] for sample in samples]
# for paired samples, we need to remove the whole pair when any part
# has a nan
nans = np.isnan(samples[0])
for sample in samples[1:]:
nans = nans | np.isnan(sample)
not_nans = ~nans
return [sample[not_nans] for sample in samples]
def _remove_sentinel(samples, paired, sentinel):
"Remove sentinel values from paired or unpaired 1D samples"
# could consolidate with `_remove_nans`, but it's not quite as simple as
# passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
# potential optimization: don't copy arrays that don't contain sentinel
if not paired:
return [sample[sample != sentinel] for sample in samples]
# for paired samples, we need to remove the whole pair when any part
# equals the sentinel value
sentinels = (samples[0] == sentinel)
for sample in samples[1:]:
sentinels = sentinels | (sample == sentinel)
not_sentinels = ~sentinels
return [sample[not_sentinels] for sample in samples]
def _masked_arrays_2_sentinel_arrays(samples):
# masked arrays in `samples` are converted to regular arrays, and values
# corresponding with masked elements are replaced with a sentinel value
# return without modifying arrays if none have a mask
has_mask = False
for sample in samples:
mask = getattr(sample, 'mask', False)
has_mask = has_mask or np.any(mask)
if not has_mask:
return samples, None # None means there is no sentinel value
# Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
# values are always omitted, but there are different nan policies.
dtype = np.result_type(*samples)
dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
for i in range(len(samples)):
# Things get more complicated if the arrays are of different types.
# We could have different sentinel values for each array, but
# the purpose of this code is convenience, not efficiency.
samples[i] = samples[i].astype(dtype, copy=False)
inexact = np.issubdtype(dtype, np.inexact)
info = np.finfo if inexact else np.iinfo
max_possible, min_possible = info(dtype).max, info(dtype).min
nextafter = np.nextafter if inexact else (lambda x, _: x - 1)
sentinel = max_possible
# For simplicity, min_possible/np.infs are not candidate sentinel values
while sentinel > min_possible:
for sample in samples:
if np.any(sample == sentinel): # choose a new sentinel value
sentinel = nextafter(sentinel, -np.inf)
break
else: # when sentinel value is OK, break the while loop
break
else:
message = ("This function replaces masked elements with sentinel "
"values, but the data contains all distinct values of this "
"data type. Consider promoting the dtype to `np.float64`.")
raise ValueError(message)
# replace masked elements with sentinel value
out_samples = []
for sample in samples:
mask = getattr(sample, 'mask', None)
if mask is not None: # turn all masked arrays into sentinel arrays
mask = np.broadcast_to(mask, sample.shape)
sample = sample.data.copy() if np.any(mask) else sample.data
sample = np.asarray(sample) # `sample.data` could be a memoryview?
sample[mask] = sentinel
out_samples.append(sample)
return out_samples, sentinel
def _check_empty_inputs(samples, axis):
"""
Check for empty sample; return appropriate output for a vectorized hypotest
"""
# if none of the samples are empty, we need to perform the test
if not any(sample.size == 0 for sample in samples):
return None
# otherwise, the statistic and p-value will be either empty arrays or
# arrays with NaNs. Produce the appropriate array and return it.
output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
output = np.ones(output_shape) * _get_nan(*samples)
return output
def _add_reduced_axes(res, reduced_axes, keepdims):
"""
Add reduced axes back to all the arrays in the result object
if ``keepdims`` is True.
"""
return ([np.expand_dims(output, reduced_axes)
if not isinstance(output, int) else output for output in res]
if keepdims else res)
# Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
_name = 'axis'
_desc = (
"""If an int, the axis of the input along which to compute the statistic.
The statistic of each axis-slice (e.g. row) of the input will appear in a
corresponding element of the output.
If ``None``, the input will be raveled before computing the statistic."""
.split('\n'))
def _get_axis_params(default_axis=0, _name=_name, _desc=_desc): # bind NOW
_type = f"int or None, default: {default_axis}"
_axis_parameter_doc = Parameter(_name, _type, _desc)
_axis_parameter = inspect.Parameter(_name,
inspect.Parameter.KEYWORD_ONLY,
default=default_axis)
return _axis_parameter_doc, _axis_parameter
_name = 'nan_policy'
_type = "{'propagate', 'omit', 'raise'}"
_desc = (
"""Defines how to handle input NaNs.
- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
which the statistic is computed, the corresponding entry of the output
will be NaN.
- ``omit``: NaNs will be omitted when performing the calculation.
If insufficient data remains in the axis slice along which the
statistic is computed, the corresponding entry of the output will be
NaN.
- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
.split('\n'))
_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
_nan_policy_parameter = inspect.Parameter(_name,
inspect.Parameter.KEYWORD_ONLY,
default='propagate')
_name = 'keepdims'
_type = "bool, default: False"
_desc = (
"""If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
the result will broadcast correctly against the input array."""
.split('\n'))
_keepdims_parameter_doc = Parameter(_name, _type, _desc)
_keepdims_parameter = inspect.Parameter(_name,
inspect.Parameter.KEYWORD_ONLY,
default=False)
_standard_note_addition = (
"""\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
code) are converted to ``np.ndarray`` before the calculation is performed. In
this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
masked array with ``mask=False``.""").split('\n')
def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
n_samples=1, paired=False,
result_to_tuple=None, too_small=0,
n_outputs=2, kwd_samples=[], override=None):
"""Factory for a wrapper that adds axis/nan_policy params to a function.
Parameters
----------
tuple_to_result : callable
Callable that returns an object of the type returned by the function
being wrapped (e.g. the namedtuple or dataclass returned by a
statistical test) provided the separate components (e.g. statistic,
pvalue).
default_axis : int, default: 0
The default value of the axis argument. Standard is 0 except when
backwards compatibility demands otherwise (e.g. `None`).
n_samples : int or callable, default: 1
The number of data samples accepted by the function
(e.g. `mannwhitneyu`), a callable that accepts a dictionary of
parameters passed into the function and returns the number of data
samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
of samples (e.g. `kruskal`).
paired : {False, True}
Whether the function being wrapped treats the samples as paired (i.e.
corresponding elements of each sample should be considered as different
components of the same sample).
result_to_tuple : callable, optional
Function that unpacks the results of the function being wrapped into
a tuple. This is essentially the inverse of `tuple_to_result`. Default
is `None`, which is appropriate for statistical tests that return a
statistic, pvalue tuple (rather than, e.g., a non-iterable dataclass).
too_small : int or callable, default: 0
The largest unacceptably small sample for the function being wrapped.
For example, some functions require samples of size two or more or they
raise an error. This argument prevents the error from being raised when
input is not 1D and instead places a NaN in the corresponding element
of the result. If callable, it must accept a list of samples, axis,
and a dictionary of keyword arguments passed to the wrapper function as
arguments and return a bool indicating whether the samples passed are
too small.
n_outputs : int or callable, default: 2
The number of outputs produced by the function given 1d sample(s). For
example, hypothesis tests that return a namedtuple or result object
with attributes ``statistic`` and ``pvalue`` use the default
``n_outputs=2``; summary statistics with scalar output use
``n_outputs=1``. Alternatively, may be a callable that accepts a
dictionary of arguments passed into the wrapped function and returns
the number of outputs corresponding with those arguments.
kwd_samples : sequence, default: []
The names of keyword parameters that should be treated as samples. For
example, `gmean` accepts as its first argument a sample `a` but
also `weights` as a fourth, optional keyword argument. In this case, we
use `n_samples=1` and `kwd_samples=['weights']`.
override : dict, default: {'vectorization': False, 'nan_propagation': True}
Pass a dictionary with ``'vectorization': True`` to ensure that the
decorator overrides the function's behavior for multidimensional input.
Use ``'nan_propagation': False`` to ensure that the decorator does not
override the function's behavior for ``nan_policy='propagate'``.
"""
# Specify which existing behaviors the decorator must override
temp = override or {}
override = {'vectorization': False,
'nan_propagation': True}
override.update(temp)
if result_to_tuple is None:
def result_to_tuple(res):
return res
if not callable(too_small):
def is_too_small(samples, *ts_args, axis=-1, **ts_kwargs):
for sample in samples:
if sample.shape[axis] <= too_small:
return True
return False
else:
is_too_small = too_small
def axis_nan_policy_decorator(hypotest_fun_in):
@wraps(hypotest_fun_in)
def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):
if _no_deco: # for testing, decorator does nothing
return hypotest_fun_in(*args, **kwds)
# For now, skip the decorator entirely if using array API. In the future,
# we'll probably want to use it for `keepdims`, `axis` tuples, etc.
if len(args) == 0: # extract sample from `kwds` if there are no `args`
used_kwd_samples = list(set(kwds).intersection(set(kwd_samples)))
temp = used_kwd_samples[:1]
else:
temp = args[0]
if not is_numpy(array_namespace(temp)):
msg = ("Use of `nan_policy` and `keepdims` "
"is incompatible with non-NumPy arrays.")
if 'nan_policy' in kwds or 'keepdims' in kwds:
raise NotImplementedError(msg)
return hypotest_fun_in(*args, **kwds)
# We need to be flexible about whether position or keyword
# arguments are used, but we need to make sure users don't pass
# both for the same parameter. To complicate matters, some
# functions accept samples with *args, and some functions already
# accept `axis` and `nan_policy` as positional arguments.
# The strategy is to make sure that there is no duplication
# between `args` and `kwds`, combine the two into `kwds`, then extract
# the samples, `nan_policy`, and `axis` from `kwds`, as they are
# dealt with separately.
# Check for intersection between positional and keyword args
params = list(inspect.signature(hypotest_fun_in).parameters)
if n_samples is None:
# Give unique names to each positional sample argument
# Note that *args can't be provided as a keyword argument
params = [f"arg{i}" for i in range(len(args))] + params[1:]
# raise if there are too many positional args
maxarg = (np.inf if inspect.getfullargspec(hypotest_fun_in).varargs
else len(inspect.getfullargspec(hypotest_fun_in).args))
if len(args) > maxarg: # let the function raise the right error
hypotest_fun_in(*args, **kwds)
# raise if multiple values passed for same parameter
d_args = dict(zip(params, args))
intersection = set(d_args) & set(kwds)
if intersection: # let the function raise the right error
hypotest_fun_in(*args, **kwds)
# Consolidate other positional and keyword args into `kwds`
kwds.update(d_args)
# rename avoids UnboundLocalError
if callable(n_samples):
# Future refactoring idea: no need for callable n_samples.
# Just replace `n_samples` and `kwd_samples` with a single
# list of the names of all samples, and treat all of them
# as `kwd_samples` are treated below.
n_samp = n_samples(kwds)
else:
n_samp = n_samples or len(args)
# get the number of outputs
n_out = n_outputs # rename to avoid UnboundLocalError
if callable(n_out):
n_out = n_out(kwds)
# If necessary, rearrange function signature: accept other samples
# as positional args right after the first n_samp args
kwd_samp = [name for name in kwd_samples
if kwds.get(name, None) is not None]
n_kwd_samp = len(kwd_samp)
if not kwd_samp:
hypotest_fun_out = hypotest_fun_in
else:
def hypotest_fun_out(*samples, **kwds):
new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
kwds.update(new_kwds)
return hypotest_fun_in(*samples[:n_samp], **kwds)
# Extract the things we need here
try: # if something is missing
samples = [np.atleast_1d(kwds.pop(param))
for param in (params[:n_samp] + kwd_samp)]
except KeyError: # let the function raise the right error
# might need to revisit this if required arg is not a "sample"
hypotest_fun_in(*args, **kwds)
vectorized = True if 'axis' in params else False
vectorized = vectorized and not override['vectorization']
axis = kwds.pop('axis', default_axis)
nan_policy = kwds.pop('nan_policy', 'propagate')
keepdims = kwds.pop("keepdims", False)
del args # avoid the possibility of passing both `args` and `kwds`
# convert masked arrays to regular arrays with sentinel values
samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)
# standardize to always work along last axis
reduced_axes = axis
if axis is None:
if samples:
# when axis=None, take the maximum of all dimensions since
# all the dimensions are reduced.
n_dims = np.max([sample.ndim for sample in samples])
reduced_axes = tuple(range(n_dims))
samples = [np.asarray(sample.ravel()) for sample in samples]
else:
samples = _broadcast_arrays(samples, axis=axis)
axis = np.atleast_1d(axis)
n_axes = len(axis)
# move all axes in `axis` to the end to be raveled
samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
for sample in samples]
shapes = [sample.shape for sample in samples]
# New shape is unchanged for all axes _not_ in `axis`
# At the end, we append the product of the shapes of the axes
# in `axis`. Appending -1 doesn't work for zero-size arrays!
new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
for shape in shapes]
samples = [sample.reshape(new_shape)
for sample, new_shape in zip(samples, new_shapes)]
axis = -1 # work over the last axis
NaN = _get_nan(*samples) if samples else np.nan
# if axis is not needed, just handle nan_policy and return
ndims = np.array([sample.ndim for sample in samples])
if np.all(ndims <= 1):
# Addresses nan_policy == "raise"
if nan_policy != 'propagate' or override['nan_propagation']:
contains_nan = [_contains_nan(sample, nan_policy)[0]
for sample in samples]
else:
# Behave as though there are no NaNs (even if there are)
contains_nan = [False]*len(samples)
# Addresses nan_policy == "propagate"
if any(contains_nan) and (nan_policy == 'propagate'
and override['nan_propagation']):
res = np.full(n_out, NaN)
res = _add_reduced_axes(res, reduced_axes, keepdims)
return tuple_to_result(*res)
# Addresses nan_policy == "omit"
too_small_msg = too_small_1d_not_omit
if any(contains_nan) and nan_policy == 'omit':
# consider passing in contains_nan
samples = _remove_nans(samples, paired)
too_small_msg = too_small_1d_omit
if sentinel:
samples = _remove_sentinel(samples, paired, sentinel)
if is_too_small(samples, kwds):
warnings.warn(too_small_msg, SmallSampleWarning, stacklevel=2)
res = np.full(n_out, NaN)
res = _add_reduced_axes(res, reduced_axes, keepdims)
return tuple_to_result(*res)
res = hypotest_fun_out(*samples, **kwds)
res = result_to_tuple(res)
res = _add_reduced_axes(res, reduced_axes, keepdims)
return tuple_to_result(*res)
# check for empty input
empty_output = _check_empty_inputs(samples, axis)
# only return empty output if zero sized input is too small.
if (
empty_output is not None
and (is_too_small(samples, kwds) or empty_output.size == 0)
):
if is_too_small(samples, kwds) and empty_output.size != 0:
warnings.warn(too_small_nd_not_omit, SmallSampleWarning,
stacklevel=2)
res = [empty_output.copy() for i in range(n_out)]
res = _add_reduced_axes(res, reduced_axes, keepdims)
return tuple_to_result(*res)
# otherwise, concatenate all samples along axis, remembering where
# each separate sample begins
lengths = np.array([sample.shape[axis] for sample in samples])
split_indices = np.cumsum(lengths)
x = _broadcast_concatenate(samples, axis)
# Addresses nan_policy == "raise"
if nan_policy != 'propagate' or override['nan_propagation']:
contains_nan, _ = _contains_nan(x, nan_policy)
else:
contains_nan = False # behave like there are no NaNs
if vectorized and not contains_nan and not sentinel:
res = hypotest_fun_out(*samples, axis=axis, **kwds)
res = result_to_tuple(res)
res = _add_reduced_axes(res, reduced_axes, keepdims)
return tuple_to_result(*res)
# Addresses nan_policy == "omit"
if contains_nan and nan_policy == 'omit':
def hypotest_fun(x):
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
samples = _remove_nans(samples, paired)
if sentinel:
samples = _remove_sentinel(samples, paired, sentinel)
if is_too_small(samples, kwds):
warnings.warn(too_small_nd_omit, SmallSampleWarning,
stacklevel=4)
return np.full(n_out, NaN)
return result_to_tuple(hypotest_fun_out(*samples, **kwds))
# Addresses nan_policy == "propagate"
elif (contains_nan and nan_policy == 'propagate'
and override['nan_propagation']):
def hypotest_fun(x):
if np.isnan(x).any():
return np.full(n_out, NaN)
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
if sentinel:
samples = _remove_sentinel(samples, paired, sentinel)
if is_too_small(samples, kwds):
return np.full(n_out, NaN)
return result_to_tuple(hypotest_fun_out(*samples, **kwds))
else:
def hypotest_fun(x):
samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
if sentinel:
samples = _remove_sentinel(samples, paired, sentinel)
if is_too_small(samples, kwds):
return np.full(n_out, NaN)
return result_to_tuple(hypotest_fun_out(*samples, **kwds))
x = np.moveaxis(x, axis, 0)
res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
res = _add_reduced_axes(res, reduced_axes, keepdims)
return tuple_to_result(*res)
_axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
doc = FunctionDoc(axis_nan_policy_wrapper)
parameter_names = [param.name for param in doc['Parameters']]
if 'axis' in parameter_names:
doc['Parameters'][parameter_names.index('axis')] = (
_axis_parameter_doc)
else:
doc['Parameters'].append(_axis_parameter_doc)
if 'nan_policy' in parameter_names:
doc['Parameters'][parameter_names.index('nan_policy')] = (
_nan_policy_parameter_doc)
else:
doc['Parameters'].append(_nan_policy_parameter_doc)
if 'keepdims' in parameter_names:
doc['Parameters'][parameter_names.index('keepdims')] = (
_keepdims_parameter_doc)
else:
doc['Parameters'].append(_keepdims_parameter_doc)
doc['Notes'] += _standard_note_addition
doc = str(doc).split("\n", 1)[1] # remove signature
axis_nan_policy_wrapper.__doc__ = str(doc)
sig = inspect.signature(axis_nan_policy_wrapper)
parameters = sig.parameters
parameter_list = list(parameters.values())
if 'axis' not in parameters:
parameter_list.append(_axis_parameter)
if 'nan_policy' not in parameters:
parameter_list.append(_nan_policy_parameter)
if 'keepdims' not in parameters:
parameter_list.append(_keepdims_parameter)
sig = sig.replace(parameters=parameter_list)
axis_nan_policy_wrapper.__signature__ = sig
return axis_nan_policy_wrapper
return axis_nan_policy_decorator
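# Illustrative usage sketch (not part of SciPy's API): how a simple statistic
# could be wrapped by the factory above so that it gains `axis`, `nan_policy`,
# and `keepdims` support. The names `DemoResult` and `demo_mean` are
# hypothetical and are only defined when this module is run directly.
if __name__ == "__main__":
    from collections import namedtuple
    DemoResult = namedtuple('DemoResult', ('statistic', 'pvalue'))

    @_axis_nan_policy_factory(DemoResult, n_samples=1, default_axis=0)
    def demo_mean(sample):
        # operates on a single 1-D sample; the decorator supplies the
        # broadcasting, NaN handling, and axis/keepdims logic
        return DemoResult(np.mean(sample), np.nan)

    data = np.array([[1.0, np.nan, 3.0],
                     [4.0, 5.0, 6.0]])
    print(demo_mean(data, axis=1, nan_policy='omit'))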


@ -0,0 +1,27 @@
# Cython declarations for the C++ classes of the biasedurn library (stocc.h)
cdef extern from "biasedurn/stocc.h" nogil:
cdef cppclass CFishersNCHypergeometric:
CFishersNCHypergeometric(int, int, int, double, double) except +
int mode()
double mean()
double variance()
double probability(int x)
double moments(double * mean, double * var)
cdef cppclass CWalleniusNCHypergeometric:
CWalleniusNCHypergeometric() except +
CWalleniusNCHypergeometric(int, int, int, double, double) except +
int mode()
double mean()
double variance()
double probability(int x)
double moments(double * mean, double * var)
cdef cppclass StochasticLib3:
StochasticLib3(int seed) except +
double Random() except +
void SetAccuracy(double accur)
int FishersNCHyp (int n, int m, int N, double odds) except +
int WalleniusNCHyp (int n, int m, int N, double odds) except +
double(*next_double)()
double(*next_normal)(const double m, const double s)


@ -0,0 +1,795 @@
import builtins
from warnings import catch_warnings, simplefilter
import numpy as np
from operator import index
from collections import namedtuple
__all__ = ['binned_statistic',
'binned_statistic_2d',
'binned_statistic_dd']
BinnedStatisticResult = namedtuple('BinnedStatisticResult',
('statistic', 'bin_edges', 'binnumber'))
def binned_statistic(x, values, statistic='mean',
bins=10, range=None):
"""
Compute a binned statistic for one or more sets of data.
This is a generalization of a histogram function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values (or set of values) within each bin.
Parameters
----------
x : (N,) array_like
A sequence of values to be binned.
values : (N,) array_like or list of (N,) array_like
The data on which the statistic will be computed. This must be
the same shape as `x`, or a set of sequences - each the same shape as
`x`. If `values` is a set of sequences, the statistic will be computed
on each independently.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'std' : compute the standard deviation within each bin. This
is implicitly calculated with ddof=0.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* 'min' : compute the minimum of values for points within each bin.
Empty bins will be represented by NaN.
* 'max' : compute the maximum of values for points within each bin.
Empty bins will be represented by NaN.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : int or sequence of scalars, optional
If `bins` is an int, it defines the number of equal-width bins in the
given range (10 by default). If `bins` is a sequence, it defines the
bin edges, including the rightmost edge, allowing for non-uniform bin
widths. Values in `x` that are smaller than lowest bin edge are
assigned to bin number 0, values beyond the highest bin are assigned to
``bins[-1]``. If the bin edges are specified, the number of bins will
be ``nx = len(bins) - 1``.
range : (float, float) or [(float, float)], optional
The lower and upper range of the bins. If not provided, range
is simply ``(x.min(), x.max())``. Values outside the range are
ignored.
Returns
-------
statistic : array
The values of the selected statistic in each bin.
bin_edges : array of dtype float
Return the bin edges ``(length(statistic)+1)``.
binnumber: 1-D ndarray of ints
Indices of the bins (corresponding to `bin_edges`) in which each value
of `x` belongs. Same length as `values`. A binnumber of `i` means the
corresponding value is between (bin_edges[i-1], bin_edges[i]).
See Also
--------
numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
Notes
-----
All but the last (righthand-most) bin is half-open. In other words, if
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
``[3, 4]``, which *includes* 4.
.. versionadded:: 0.11.0
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
First some basic examples:
Create two evenly spaced bins in the range of the given sample, and sum the
corresponding values in each of those bins:
>>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
BinnedStatisticResult(statistic=array([4. , 4.5]),
bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
Multiple arrays of values can also be passed. The statistic is calculated
on each set independently:
>>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
>>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
BinnedStatisticResult(statistic=array([[4. , 4.5],
[8. , 9. ]]), bin_edges=array([1., 4., 7.]),
binnumber=array([1, 1, 1, 2, 2]))
>>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
... bins=3)
BinnedStatisticResult(statistic=array([1., 2., 4.]),
bin_edges=array([1., 2., 3., 4.]),
binnumber=array([1, 2, 1, 2, 3]))
As a second example, we now generate some random data of sailing boat speed
as a function of wind speed, and then determine how fast our boat is for
certain wind speeds:
>>> rng = np.random.default_rng()
>>> windspeed = 8 * rng.random(500)
>>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
>>> plt.figure()
>>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
... label='binned statistic of data')
>>> plt.legend()
Now we can use ``binnumber`` to select all datapoints with a windspeed
below 1:
>>> low_boatspeed = boatspeed[binnumber == 0]
As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
plot of a distribution that shows the mean and distribution around that
mean per bin, on top of a regular histogram and the probability
distribution function:
>>> x = np.linspace(0, 5, num=500)
>>> x_pdf = stats.maxwell.pdf(x)
>>> samples = stats.maxwell.rvs(size=10000)
>>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
... statistic='mean', bins=25)
>>> bin_width = (bin_edges[1] - bin_edges[0])
>>> bin_centers = bin_edges[1:] - bin_width/2
>>> plt.figure()
>>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
... alpha=0.2, label='histogram of data')
>>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
>>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
... label='binned statistic of data')
>>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
>>> plt.legend(fontsize=10)
>>> plt.show()
"""
try:
N = len(bins)
except TypeError:
N = 1
if N != 1:
bins = [np.asarray(bins, float)]
if range is not None:
if len(range) == 2:
range = [range]
medians, edges, binnumbers = binned_statistic_dd(
[x], values, statistic, bins, range)
return BinnedStatisticResult(medians, edges[0], binnumbers)
BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
('statistic', 'x_edge', 'y_edge',
'binnumber'))
def binned_statistic_2d(x, y, values, statistic='mean',
bins=10, range=None, expand_binnumbers=False):
"""
Compute a bidimensional binned statistic for one or more sets of data.
This is a generalization of a histogram2d function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values (or set of values) within each bin.
Parameters
----------
x : (N,) array_like
A sequence of values to be binned along the first dimension.
y : (N,) array_like
A sequence of values to be binned along the second dimension.
values : (N,) array_like or list of (N,) array_like
The data on which the statistic will be computed. This must be
the same shape as `x`, or a list of sequences - each with the same
shape as `x`. If `values` is such a list, the statistic will be
computed on each independently.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'std' : compute the standard deviation within each bin. This
is implicitly calculated with ddof=0.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* 'min' : compute the minimum of values for points within each bin.
Empty bins will be represented by NaN.
* 'max' : compute the maximum of values for points within each bin.
Empty bins will be represented by NaN.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if this returns an error.
bins : int or [int, int] or array_like or [array, array], optional
The bin specification:
* the number of bins for the two dimensions (nx = ny = bins),
* the number of bins in each dimension (nx, ny = bins),
* the bin edges for the two dimensions (x_edge = y_edge = bins),
* the bin edges in each dimension (x_edge, y_edge = bins).
If the bin edges are specified, the number of bins will be
``nx = len(x_edge) - 1`` and ``ny = len(y_edge) - 1``.
range : (2,2) array_like, optional
The leftmost and rightmost edges of the bins along each dimension
(if not specified explicitly in the `bins` parameters):
[[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
considered outliers and not tallied in the histogram.
expand_binnumbers : bool, optional
'False' (default): the returned `binnumber` is a shape (N,) array of
linearized bin indices.
'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
ndarray, where each row gives the bin numbers in the corresponding
dimension.
See the `binnumber` returned value, and the `Examples` section.
.. versionadded:: 0.17.0
Returns
-------
statistic : (nx, ny) ndarray
The values of the selected statistic in each two-dimensional bin.
x_edge : (nx + 1) ndarray
The bin edges along the first dimension.
y_edge : (ny + 1) ndarray
The bin edges along the second dimension.
binnumber : (N,) array of ints or (2,N) ndarray of ints
This assigns to each element of `sample` an integer that represents the
bin in which this observation falls. The representation depends on the
`expand_binnumbers` argument. See `Notes` for details.
See Also
--------
numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
Notes
-----
Binedges:
All but the last (righthand-most) bin is half-open. In other words, if
`bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
but excluding 2) and the second ``[2, 3)``. The last bin, however, is
``[3, 4]``, which *includes* 4.
`binnumber`:
This returned argument assigns to each element of `sample` an integer that
represents the bin in which it belongs. The representation depends on the
`expand_binnumbers` argument. If 'False' (default): The returned
`binnumber` is a shape (N,) array of linearized indices mapping each
element of `sample` to its corresponding bin (using row-major ordering).
Note that the returned linearized bin indices are used for an array with
extra bins on the outer binedges to capture values outside of the defined
bin bounds.
If 'True': The returned `binnumber` is a shape (2,N) ndarray where
each row indicates bin placements for each dimension respectively. In each
dimension, a binnumber of `i` means the corresponding value is between
(D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
.. versionadded:: 0.11.0
Examples
--------
>>> from scipy import stats
Calculate the counts with explicit bin-edges:
>>> x = [0.1, 0.1, 0.1, 0.6]
>>> y = [2.1, 2.6, 2.1, 2.1]
>>> binx = [0.0, 0.5, 1.0]
>>> biny = [2.0, 2.5, 3.0]
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
>>> ret.statistic
array([[2., 1.],
[1., 0.]])
The bin in which each sample is placed is given by the `binnumber`
returned parameter. By default, these are the linearized bin indices:
>>> ret.binnumber
array([5, 6, 5, 9])
The bin indices can also be expanded into separate entries for each
dimension using the `expand_binnumbers` parameter:
>>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
... expand_binnumbers=True)
>>> ret.binnumber
array([[1, 1, 1, 2],
[1, 2, 1, 1]])
This shows that the first three elements belong in xbin 1 and the
fourth in xbin 2; and likewise for y.
"""
# This code is based on np.histogram2d
try:
N = len(bins)
except TypeError:
N = 1
if N != 1 and N != 2:
xedges = yedges = np.asarray(bins, float)
bins = [xedges, yedges]
medians, edges, binnumbers = binned_statistic_dd(
[x, y], values, statistic, bins, range,
expand_binnumbers=expand_binnumbers)
return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
('statistic', 'bin_edges',
'binnumber'))
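# np.bincount does not accept complex-valued weights, so the helper below bins
# the real and imaginary parts separately and recombines them.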
def _bincount(x, weights):
if np.iscomplexobj(weights):
a = np.bincount(x, np.real(weights))
b = np.bincount(x, np.imag(weights))
z = a + b*1j
else:
z = np.bincount(x, weights)
return z
def binned_statistic_dd(sample, values, statistic='mean',
bins=10, range=None, expand_binnumbers=False,
binned_statistic_result=None):
"""
Compute a multidimensional binned statistic for a set of data.
This is a generalization of a histogramdd function. A histogram divides
the space into bins, and returns the count of the number of points in
each bin. This function allows the computation of the sum, mean, median,
or other statistic of the values within each bin.
Parameters
----------
sample : array_like
Data to histogram passed as a sequence of N arrays of length D, or
as an (N,D) array.
values : (N,) array_like or list of (N,) array_like
The data on which the statistic will be computed. This must be
the same shape as `sample`, or a list of sequences - each with the
same shape as `sample`. If `values` is such a list, the statistic
will be computed on each independently.
statistic : string or callable, optional
The statistic to compute (default is 'mean').
The following statistics are available:
* 'mean' : compute the mean of values for points within each bin.
Empty bins will be represented by NaN.
* 'median' : compute the median of values for points within each
bin. Empty bins will be represented by NaN.
* 'count' : compute the count of points within each bin. This is
identical to an unweighted histogram. `values` array is not
referenced.
* 'sum' : compute the sum of values for points within each bin.
This is identical to a weighted histogram.
* 'std' : compute the standard deviation within each bin. This
is implicitly calculated with ddof=0. If the number of values
within a given bin is 0 or 1, the computed standard deviation value
will be 0 for the bin.
* 'min' : compute the minimum of values for points within each bin.
Empty bins will be represented by NaN.
* 'max' : compute the maximum of values for points within each bin.
Empty bins will be represented by NaN.
* function : a user-defined function which takes a 1D array of
values, and outputs a single numerical statistic. This function
will be called on the values in each bin. Empty bins will be
represented by function([]), or NaN if the call raises an error.
bins : sequence or positive int, optional
The bin specification must be in one of the following forms:
* A sequence of arrays describing the bin edges along each dimension.
* The number of bins for each dimension (nx, ny, ... = bins).
* The number of bins for all dimensions (nx = ny = ... = bins).
range : sequence, optional
A sequence of lower and upper bin edges to be used if the edges are
not given explicitly in `bins`. Defaults to the minimum and maximum
values along each dimension.
expand_binnumbers : bool, optional
'False' (default): the returned `binnumber` is a shape (N,) array of
linearized bin indices.
'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
ndarray, where each row gives the bin numbers in the corresponding
dimension.
See the `binnumber` returned value, and the `Examples` section of
`binned_statistic_2d`.
binned_statistic_result : BinnedStatisticddResult, optional
Result of a previous call to the function in order to reuse bin edges
and bin numbers with new values and/or a different statistic.
To reuse bin numbers, `expand_binnumbers` must have been set to False
(the default).
.. versionadded:: 0.17.0
Returns
-------
statistic : ndarray, shape (nx1, nx2, nx3, ...)
The values of the selected statistic in each bin.
bin_edges : list of ndarrays
A list of D arrays describing the (nxi + 1) bin edges for each
dimension.
binnumber : (N,) array of ints or (D,N) ndarray of ints
This assigns to each element of `sample` an integer that represents the
bin in which this observation falls. The representation depends on the
`expand_binnumbers` argument. See `Notes` for details.
See Also
--------
numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
Notes
-----
Binedges:
All but the last (righthand-most) bin is half-open in each dimension. In
other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
last bin, however, is ``[3, 4]``, which *includes* 4.
`binnumber`:
This returned argument assigns to each element of `sample` an integer that
represents the bin in which it belongs. The representation depends on the
`expand_binnumbers` argument. If 'False' (default): The returned
`binnumber` is a shape (N,) array of linearized indices mapping each
element of `sample` to its corresponding bin (using row-major ordering).
If 'True': The returned `binnumber` is a shape (D,N) ndarray where
each row indicates bin placements for each dimension respectively. In each
dimension, a binnumber of `i` means the corresponding value is between
(bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
.. versionadded:: 0.11.0
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> from mpl_toolkits.mplot3d import Axes3D
Take an array of 600 (x, y) coordinates as an example.
`binned_statistic_dd` can handle arrays of higher dimension `D`, but a plot
of dimension `D+1` is then required to visualize the result.
>>> mu = np.array([0., 1.])
>>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
>>> multinormal = stats.multivariate_normal(mu, sigma)
>>> data = multinormal.rvs(size=600, random_state=235412)
>>> data.shape
(600, 2)
Create bins and count how many arrays fall in each bin:
>>> N = 60
>>> x = np.linspace(-3, 3, N)
>>> y = np.linspace(-3, 4, N)
>>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
... statistic='count')
>>> bincounts = ret.statistic
Set the volume and the location of bars:
>>> dx = x[1] - x[0]
>>> dy = y[1] - y[0]
>>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
>>> z = 0
>>> bincounts = bincounts.ravel()
>>> x = x.ravel()
>>> y = y.ravel()
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111, projection='3d')
>>> with np.errstate(divide='ignore'): # silence random axes3d warning
... ax.bar3d(x, y, z, dx, dy, bincounts)
Reuse bin numbers and bin edges with new values:
>>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
... binned_statistic_result=ret,
... statistic='mean')
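`values` may also be a list of arrays, in which case the statistic is
computed for each array independently and the leading axis of the result
indexes the list:
>>> ret3 = stats.binned_statistic_dd(data, [np.arange(600), -np.arange(600)],
...                                  binned_statistic_result=ret,
...                                  statistic='mean')
>>> ret3.statistic.shape
(2, 59, 59)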
"""
known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
if not callable(statistic) and statistic not in known_stats:
raise ValueError(f'invalid statistic {statistic!r}')
try:
bins = index(bins)
except TypeError:
# bins is not an integer
pass
# If bins was an integer-like object, now it is an actual Python int.
# NOTE: for _bin_edges(), see e.g. gh-11365
if isinstance(bins, int) and not np.isfinite(sample).all():
raise ValueError(f'{sample!r} contains non-finite values.')
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
# `Dlen` is the length of elements along each dimension.
# This code is based on np.histogramdd
try:
# `sample` is an ND-array.
Dlen, Ndim = sample.shape
except (AttributeError, ValueError):
# `sample` is a sequence of 1D arrays.
sample = np.atleast_2d(sample).T
Dlen, Ndim = sample.shape
# Store initial shape of `values` to preserve it in the output
values = np.asarray(values)
input_shape = list(values.shape)
# Make sure that `values` is 2D to iterate over rows
values = np.atleast_2d(values)
Vdim, Vlen = values.shape
# Make sure `values` match `sample`
if statistic != 'count' and Vlen != Dlen:
raise AttributeError('The number of `values` elements must match the '
'length of each `sample` dimension.')
try:
M = len(bins)
if M != Ndim:
raise AttributeError('The dimension of bins must be equal '
'to the dimension of the sample x.')
except TypeError:
bins = Ndim * [bins]
if binned_statistic_result is None:
nbin, edges, dedges = _bin_edges(sample, bins, range)
binnumbers = _bin_numbers(sample, nbin, edges, dedges)
else:
edges = binned_statistic_result.bin_edges
nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
# +1 for outlier bins
dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
binnumbers = binned_statistic_result.binnumber
# Avoid overflow with double precision. Complex `values` -> `complex128`.
result_type = np.result_type(values, np.float64)
result = np.empty([Vdim, nbin.prod()], dtype=result_type)
if statistic in {'mean', np.mean}:
result.fill(np.nan)
flatcount = _bincount(binnumbers, None)
a = flatcount.nonzero()
for vv in builtins.range(Vdim):
flatsum = _bincount(binnumbers, values[vv])
result[vv, a] = flatsum[a] / flatcount[a]
elif statistic in {'std', np.std}:
result.fill(np.nan)
flatcount = _bincount(binnumbers, None)
a = flatcount.nonzero()
for vv in builtins.range(Vdim):
flatsum = _bincount(binnumbers, values[vv])
delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
std = np.sqrt(
_bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
)
result[vv, a] = std
result = np.real(result)
elif statistic == 'count':
result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
result.fill(0)
flatcount = _bincount(binnumbers, None)
a = np.arange(len(flatcount))
result[:, a] = flatcount[np.newaxis, :]
elif statistic in {'sum', np.sum}:
result.fill(0)
for vv in builtins.range(Vdim):
flatsum = _bincount(binnumbers, values[vv])
a = np.arange(len(flatsum))
result[vv, a] = flatsum
elif statistic in {'median', np.median}:
result.fill(np.nan)
for vv in builtins.range(Vdim):
i = np.lexsort((values[vv], binnumbers))
_, j, counts = np.unique(binnumbers[i],
return_index=True, return_counts=True)
mid = j + (counts - 1) / 2
mid_a = values[vv, i][np.floor(mid).astype(int)]
mid_b = values[vv, i][np.ceil(mid).astype(int)]
medians = (mid_a + mid_b) / 2
result[vv, binnumbers[i][j]] = medians
elif statistic in {'min', np.min}:
result.fill(np.nan)
for vv in builtins.range(Vdim):
i = np.argsort(values[vv])[::-1] # Reversed so the min is last
result[vv, binnumbers[i]] = values[vv, i]
elif statistic in {'max', np.max}:
result.fill(np.nan)
for vv in builtins.range(Vdim):
i = np.argsort(values[vv])
result[vv, binnumbers[i]] = values[vv, i]
elif callable(statistic):
with np.errstate(invalid='ignore'), catch_warnings():
simplefilter("ignore", RuntimeWarning)
try:
null = statistic([])
except Exception:
null = np.nan
if np.iscomplexobj(null):
result = result.astype(np.complex128)
result.fill(null)
try:
_calc_binned_statistic(
Vdim, binnumbers, result, values, statistic
)
except ValueError:
result = result.astype(np.complex128)
_calc_binned_statistic(
Vdim, binnumbers, result, values, statistic
)
# Shape into a proper matrix
result = result.reshape(np.append(Vdim, nbin))
# Remove outliers (indices 0 and -1 for each bin-dimension).
core = tuple([slice(None)] + Ndim * [slice(1, -1)])
result = result[core]
# Unravel binnumbers into an ndarray, each row the bins for each dimension
if expand_binnumbers and Ndim > 1:
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
if np.any(result.shape[1:] != nbin - 2):
raise RuntimeError('Internal Shape Error')
# Reshape to have output (`result`) match input (`values`) shape
result = result.reshape(input_shape[:-1] + list(nbin-2))
return BinnedStatisticddResult(result, edges, binnumbers)
def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
unique_bin_numbers = np.unique(bin_numbers)
for vv in builtins.range(Vdim):
bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
values, vv)
for i in unique_bin_numbers:
stat = stat_func(np.array(bin_map[i]))
if np.iscomplexobj(stat) and not np.iscomplexobj(result):
raise ValueError("The statistic function returns complex ")
result[vv, i] = stat
def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
""" Create hashmap of bin ids to values in bins
key: bin number
value: list of binned data
"""
bin_map = dict()
for i in unique_bin_numbers:
bin_map[i] = []
for i in builtins.range(len(bin_numbers)):
bin_map[bin_numbers[i]].append(values[vv, i])
return bin_map
def _bin_edges(sample, bins=None, range=None):
""" Create edge arrays
"""
Dlen, Ndim = sample.shape
nbin = np.empty(Ndim, int) # Number of bins in each dimension
edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
dedges = Ndim * [None] # Spacing between edges (will be 2D array)
# Select range for each dimension
# Used only if number of bins is given.
if range is None:
smin = np.atleast_1d(np.array(sample.min(axis=0), float))
smax = np.atleast_1d(np.array(sample.max(axis=0), float))
else:
if len(range) != Ndim:
raise ValueError(
f"range given for {len(range)} dimensions; {Ndim} required")
smin = np.empty(Ndim)
smax = np.empty(Ndim)
for i in builtins.range(Ndim):
if range[i][1] < range[i][0]:
raise ValueError(
"In {}range, start must be <= stop".format(
f"dimension {i + 1} of " if Ndim > 1 else ""))
smin[i], smax[i] = range[i]
# Make sure the bins have a finite width.
for i in builtins.range(len(smin)):
if smin[i] == smax[i]:
smin[i] = smin[i] - .5
smax[i] = smax[i] + .5
# Preserve sample floating point precision in bin edges
edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
else float)
# Create edge arrays
for i in builtins.range(Ndim):
if np.isscalar(bins[i]):
nbin[i] = bins[i] + 2 # +2 for outlier bins
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
dtype=edges_dtype)
else:
edges[i] = np.asarray(bins[i], edges_dtype)
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
dedges[i] = np.diff(edges[i])
nbin = np.asarray(nbin)
return nbin, edges, dedges
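# Illustrative example: for a single dimension with bins=[5] and data
# spanning [0, 10], nbin[0] == 7 (5 real bins plus 2 outlier bins) and
# edges[0] == np.linspace(0, 10, 6), i.e. six edge values.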
def _bin_numbers(sample, nbin, edges, dedges):
"""Compute the bin number each sample falls into, in each dimension
"""
Dlen, Ndim = sample.shape
sampBin = [
np.digitize(sample[:, i], edges[i])
for i in range(Ndim)
]
# Using `digitize`, values that fall on an edge are put in the right bin.
# For the rightmost bin, we want values equal to the right
# edge to be counted in the last bin, and not as an outlier.
for i in range(Ndim):
# Find the rounding precision
dedges_min = dedges[i].min()
if dedges_min == 0:
raise ValueError('The smallest edge difference is numerically 0.')
decimal = int(-np.log10(dedges_min)) + 6
# Find which points are on the rightmost edge.
on_edge = np.where((sample[:, i] >= edges[i][-1]) &
(np.around(sample[:, i], decimal) ==
np.around(edges[i][-1], decimal)))[0]
# Shift these points one bin to the left.
sampBin[i][on_edge] -= 1
# Compute the sample indices in the flattened statistic matrix.
binnumbers = np.ravel_multi_index(sampBin, nbin)
return binnumbers
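# Illustrative example: with nbin == (4, 4) (two real bins plus two outlier
# bins per dimension), a sample that digitizes to (1, 2) receives the
# linearized bin number 1*4 + 2 == 6, which matches the `binnumber` values
# shown in the `binned_statistic_2d` docstring.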


@ -0,0 +1,375 @@
from math import sqrt
import numpy as np
from scipy._lib._util import _validate_int
from scipy.optimize import brentq
from scipy.special import ndtri
from ._discrete_distns import binom
from ._common import ConfidenceInterval
class BinomTestResult:
"""
Result of `scipy.stats.binomtest`.
Attributes
----------
k : int
The number of successes (copied from `binomtest` input).
n : int
The number of trials (copied from `binomtest` input).
alternative : str
Indicates the alternative hypothesis specified in the input
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
or ``'less'``.
statistic : float
The estimate of the proportion of successes.
pvalue : float
The p-value of the hypothesis test.
"""
def __init__(self, k, n, alternative, statistic, pvalue):
self.k = k
self.n = n
self.alternative = alternative
self.statistic = statistic
self.pvalue = pvalue
# add alias for backward compatibility
self.proportion_estimate = statistic
def __repr__(self):
s = ("BinomTestResult("
f"k={self.k}, "
f"n={self.n}, "
f"alternative={self.alternative!r}, "
f"statistic={self.statistic}, "
f"pvalue={self.pvalue})")
return s
def proportion_ci(self, confidence_level=0.95, method='exact'):
"""
Compute the confidence interval for ``statistic``.
Parameters
----------
confidence_level : float, optional
Confidence level for the computed confidence interval
of the estimated proportion. Default is 0.95.
method : {'exact', 'wilson', 'wilsoncc'}, optional
Selects the method used to compute the confidence interval
for the estimate of the proportion:
'exact' :
Use the Clopper-Pearson exact method [1]_.
'wilson' :
Wilson's method, without continuity correction ([2]_, [3]_).
'wilsoncc' :
Wilson's method, with continuity correction ([2]_, [3]_).
Default is ``'exact'``.
Returns
-------
ci : ``ConfidenceInterval`` object
The object has attributes ``low`` and ``high`` that hold the
lower and upper bounds of the confidence interval.
References
----------
.. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
fiducial limits illustrated in the case of the binomial,
Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
.. [2] E. B. Wilson, Probable inference, the law of succession, and
statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
(1927).
.. [3] Robert G. Newcombe, Two-sided confidence intervals for the
single proportion: comparison of seven methods, Statistics
in Medicine, 17, pp 857-872 (1998).
Examples
--------
>>> from scipy.stats import binomtest
>>> result = binomtest(k=7, n=50, p=0.1)
>>> result.statistic
0.14
>>> result.proportion_ci()
ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
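Other confidence interval methods can be selected with the `method`
argument; for instance, Wilson's method without continuity correction.
The resulting interval also covers the observed proportion:
>>> ci = result.proportion_ci(confidence_level=0.95, method='wilson')
>>> print(ci.low < result.statistic < ci.high)
True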
"""
if method not in ('exact', 'wilson', 'wilsoncc'):
raise ValueError(f"method ('{method}') must be one of 'exact', "
"'wilson' or 'wilsoncc'.")
if not (0 <= confidence_level <= 1):
raise ValueError(f'confidence_level ({confidence_level}) must be in '
'the interval [0, 1].')
if method == 'exact':
low, high = _binom_exact_conf_int(self.k, self.n,
confidence_level,
self.alternative)
else:
# method is 'wilson' or 'wilsoncc'
low, high = _binom_wilson_conf_int(self.k, self.n,
confidence_level,
self.alternative,
correction=method == 'wilsoncc')
return ConfidenceInterval(low=low, high=high)
def _findp(func):
try:
p = brentq(func, 0, 1)
except RuntimeError:
raise RuntimeError('numerical solver failed to converge when '
'computing the confidence limits') from None
except ValueError as exc:
raise ValueError('brentq raised a ValueError; report this to the '
'SciPy developers') from exc
return p
def _binom_exact_conf_int(k, n, confidence_level, alternative):
"""
Compute the exact (Clopper-Pearson) confidence interval for the binomial
proportion. Returns the confidence limits (plow, phigh).
"""
if alternative == 'two-sided':
alpha = (1 - confidence_level) / 2
if k == 0:
plow = 0.0
else:
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
if k == n:
phigh = 1.0
else:
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
elif alternative == 'less':
alpha = 1 - confidence_level
plow = 0.0
if k == n:
phigh = 1.0
else:
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
elif alternative == 'greater':
alpha = 1 - confidence_level
if k == 0:
plow = 0.0
else:
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
phigh = 1.0
return plow, phigh
def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
# This function assumes that the arguments have already been validated.
# In particular, `alternative` must be one of 'two-sided', 'less' or
# 'greater'.
p = k / n
if alternative == 'two-sided':
z = ndtri(0.5 + 0.5*confidence_level)
else:
z = ndtri(confidence_level)
# For reference, the formulas implemented here are from
# Newcombe (1998) (ref. [3] in the proportion_ci docstring).
denom = 2*(n + z**2)
center = (2*n*p + z**2)/denom
q = 1 - p
if correction:
if alternative == 'less' or k == 0:
lo = 0.0
else:
dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
lo = center - dlo
if alternative == 'greater' or k == n:
hi = 1.0
else:
dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
hi = center + dhi
else:
delta = z/denom * sqrt(4*n*p*q + z**2)
if alternative == 'less' or k == 0:
lo = 0.0
else:
lo = center - delta
if alternative == 'greater' or k == n:
hi = 1.0
else:
hi = center + delta
return lo, hi
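# For reference, a sketch of the Newcombe (1998) formulas implemented above
# (two-sided case, no continuity correction): with p = k/n, q = 1 - p and z
# the standard normal quantile,
#     center = (2*n*p + z**2) / (2*(n + z**2))
#     delta  = z*sqrt(4*n*p*q + z**2) / (2*(n + z**2))
# and the interval is (center - delta, center + delta); for one-sided
# alternatives the unused bound is replaced by 0 or 1.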
def binomtest(k, n, p=0.5, alternative='two-sided'):
"""
Perform a test that the probability of success is p.
The binomial test [1]_ is a test of the null hypothesis that the
probability of success in a Bernoulli experiment is `p`.
Details of the test can be found in many texts on statistics, such
as section 24.5 of [2]_.
Parameters
----------
k : int
The number of successes.
n : int
The number of trials.
p : float, optional
The hypothesized probability of success, i.e. the expected
proportion of successes. The value must be in the interval
``0 <= p <= 1``. The default value is ``p = 0.5``.
alternative : {'two-sided', 'greater', 'less'}, optional
Indicates the alternative hypothesis. The default value is
'two-sided'.
Returns
-------
result : `~scipy.stats._result_classes.BinomTestResult` instance
The return value is an object with the following attributes:
k : int
The number of successes (copied from `binomtest` input).
n : int
The number of trials (copied from `binomtest` input).
alternative : str
Indicates the alternative hypothesis specified in the input
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
or ``'less'``.
statistic : float
The estimate of the proportion of successes.
pvalue : float
The p-value of the hypothesis test.
The object has the following methods:
proportion_ci(confidence_level=0.95, method='exact') :
Compute the confidence interval for ``statistic``.
Notes
-----
.. versionadded:: 1.7.0
References
----------
.. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
.. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
Prentice Hall, Upper Saddle River, New Jersey USA (2010)
Examples
--------
>>> from scipy.stats import binomtest
A car manufacturer claims that no more than 10% of their cars are unsafe.
Fifteen cars are inspected for safety; 3 are found to be unsafe. Test the
manufacturer's claim:
>>> result = binomtest(3, n=15, p=0.1, alternative='greater')
>>> result.pvalue
0.18406106910639114
The null hypothesis cannot be rejected at the 5% level of significance
because the returned p-value is greater than the critical value of 5%.
The test statistic is equal to the estimated proportion, which is simply
``3/15``:
>>> result.statistic
0.2
We can use the `proportion_ci()` method of the result to compute the
confidence interval of the estimate:
>>> result.proportion_ci(confidence_level=0.95)
ConfidenceInterval(low=0.05684686759024681, high=1.0)
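The same data provide no evidence for the one-sided alternative that the
proportion of unsafe cars is *less* than 10%; the corresponding p-value is
large:
>>> result_less = binomtest(3, n=15, p=0.1, alternative='less')
>>> print(result_less.pvalue > 0.9)
True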
"""
k = _validate_int(k, 'k', minimum=0)
n = _validate_int(n, 'n', minimum=1)
if k > n:
raise ValueError(f'k ({k}) must not be greater than n ({n}).')
if not (0 <= p <= 1):
raise ValueError(f"p ({p}) must be in range [0,1]")
if alternative not in ('two-sided', 'less', 'greater'):
raise ValueError(f"alternative ('{alternative}') not recognized; \n"
"must be 'two-sided', 'less' or 'greater'")
if alternative == 'less':
pval = binom.cdf(k, n, p)
elif alternative == 'greater':
pval = binom.sf(k-1, n, p)
else:
# alternative is 'two-sided'
d = binom.pmf(k, n, p)
rerr = 1 + 1e-7
if k == p * n:
# special case as shortcut, would also be handled by `else` below
pval = 1.
elif k < p * n:
ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
-d*rerr, np.ceil(p * n), n)
# y is the number of terms between mode and n that are <= d*rerr.
# ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
# If the equality a(ix) == d*rerr does not hold, y = n - ix. Otherwise, we
# need to include ix as well, since the equality holds. Note that
# the equality will hold in very very rare situations due to rerr.
y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
else:
ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
d*rerr, 0, np.floor(p * n))
# y is the number of terms between 0 and mode that are <= d*rerr.
# we need to add a 1 to account for the 0 index.
# For comparing this with old behavior, see
# tst_binary_srch_for_binom_tst method in test_morestats.
y = ix + 1
pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
pval = min(1.0, pval)
result = BinomTestResult(k=k, n=n, alternative=alternative,
statistic=k/n, pvalue=pval)
return result
def _binary_search_for_binom_tst(a, d, lo, hi):
"""
Conducts an implicit binary search on a function specified by `a`.
Meant to be used on the binomial PMF for the case of two-sided tests
to obtain the value on the other side of the mode where the tail
probability should be computed. The values on either side of
the mode are always in order, meaning binary search is applicable.
Parameters
----------
a : callable
The function over which to perform binary search. Its values
for inputs lo and hi should be in ascending order.
d : float
The value to search for.
lo : int
The lower end of range to search.
hi : int
The higher end of the range to search.
Returns
-------
int
The index ``i``, between `lo` and `hi`, such that ``a(i) <= d < a(i+1)``.
"""
while lo < hi:
mid = lo + (hi-lo)//2
midval = a(mid)
if midval < d:
lo = mid+1
elif midval > d:
hi = mid-1
else:
return mid
if a(lo) <= d:
return lo
else:
return lo-1
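# Illustrative example: with the non-decreasing function a(i) = i**2 on
# [0, 10], _binary_search_for_binom_tst(lambda i: i**2, 17, 0, 10) returns 4,
# because a(4) = 16 <= 17 < 25 = a(5).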


@ -0,0 +1,177 @@
import numpy as np
from functools import partial
from scipy import stats
def _bws_input_validation(x, y, alternative, method):
''' Input validation and standardization for bws test'''
x, y = np.atleast_1d(x, y)
if x.ndim > 1 or y.ndim > 1:
raise ValueError('`x` and `y` must be exactly one-dimensional.')
if np.isnan(x).any() or np.isnan(y).any():
raise ValueError('`x` and `y` must not contain NaNs.')
if np.size(x) == 0 or np.size(y) == 0:
raise ValueError('`x` and `y` must be of nonzero size.')
z = stats.rankdata(np.concatenate((x, y)))
x, y = z[:len(x)], z[len(x):]
alternatives = {'two-sided', 'less', 'greater'}
alternative = alternative.lower()
if alternative not in alternatives:
raise ValueError(f'`alternative` must be one of {alternatives}.')
method = stats.PermutationMethod() if method is None else method
if not isinstance(method, stats.PermutationMethod):
raise ValueError('`method` must be an instance of '
'`scipy.stats.PermutationMethod`')
return x, y, alternative, method
def _bws_statistic(x, y, alternative, axis):
'''Compute the BWS test statistic for two independent samples'''
# Public function currently does not accept `axis`, but `permutation_test`
# uses `axis` to make vectorized call.
Ri, Hj = np.sort(x, axis=axis), np.sort(y, axis=axis)
n, m = Ri.shape[axis], Hj.shape[axis]
i, j = np.arange(1, n+1), np.arange(1, m+1)
Bx_num = Ri - (m + n)/n * i
By_num = Hj - (m + n)/m * j
if alternative == 'two-sided':
Bx_num *= Bx_num
By_num *= By_num
else:
Bx_num *= np.abs(Bx_num)
By_num *= np.abs(By_num)
Bx_den = i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n
By_den = j/(m+1) * (1 - j/(m+1)) * n*(m+n)/m
Bx = 1/n * np.sum(Bx_num/Bx_den, axis=axis)
By = 1/m * np.sum(By_num/By_den, axis=axis)
B = (Bx + By) / 2 if alternative == 'two-sided' else (Bx - By) / 2
return B
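# For the two-sided case the statistic above follows Neuhäuser (2005),
# Section 2:
#   B_x = (1/n) * sum_i (R_i - (m+n)/n * i)**2
#                   / [ i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n ]
# with B_y defined symmetrically from the sorted ranks H_j of `y`, and
# B = (B_x + B_y) / 2.  For the one-sided statistic, the numerators keep
# their sign (x*|x| instead of x**2) and B = (B_x - B_y) / 2.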
def bws_test(x, y, *, alternative="two-sided", method=None):
r'''Perform the Baumgartner-Weiss-Schindler test on two independent samples.
The Baumgartner-Weiss-Schindler (BWS) test is a nonparametric test of
the null hypothesis that the distribution underlying sample `x`
is the same as the distribution underlying sample `y`. Unlike
the Kolmogorov-Smirnov, Wilcoxon, and Cramér-von Mises tests,
the BWS test weights the integral by the variance of the difference
in cumulative distribution functions (CDFs), emphasizing the tails of the
distributions, which increases the power of the test in many applications.
Parameters
----------
x, y : array-like
1-d arrays of samples.
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis. Default is 'two-sided'.
Let *F(u)* and *G(u)* be the cumulative distribution functions of the
distributions underlying `x` and `y`, respectively. Then the following
alternative hypotheses are available:
* 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
at least one *u*.
* 'less': the distribution underlying `x` is stochastically less than
the distribution underlying `y`, i.e. *F(u) >= G(u)* for all *u*.
* 'greater': the distribution underlying `x` is stochastically greater
than the distribution underlying `y`, i.e. *F(u) <= G(u)* for all
*u*.
Under a more restrictive set of assumptions, the alternative hypotheses
can be expressed in terms of the locations of the distributions;
see [2]_ section 5.1.
method : PermutationMethod, optional
Configures the method used to compute the p-value. The default is a
`PermutationMethod` object created with default arguments.
Returns
-------
res : PermutationTestResult
An object with attributes:
statistic : float
The observed test statistic of the data.
pvalue : float
The p-value for the given alternative.
null_distribution : ndarray
The values of the test statistic generated under the null hypothesis.
See also
--------
scipy.stats.wilcoxon, scipy.stats.mannwhitneyu, scipy.stats.ttest_ind
Notes
-----
When ``alternative=='two-sided'``, the statistic is defined by the
equations given in [1]_ Section 2. This statistic is not appropriate for
one-sided alternatives; in that case, the statistic is the *negative* of
that given by the equations in [1]_ Section 2. Consequently, when the
distribution of the first sample is stochastically greater than that of the
second sample, the statistic will tend to be positive.
References
----------
.. [1] Neuhäuser, M. (2005). Exact Tests Based on the
Baumgartner-Weiss-Schindler Statistic: A Survey. Statistical Papers,
46(1), 1-29.
.. [2] Fay, M. P., & Proschan, M. A. (2010). Wilcoxon-Mann-Whitney or t-test?
On assumptions for hypothesis tests and multiple interpretations of
decision rules. Statistics surveys, 4, 1.
Examples
--------
We follow the example of table 3 in [1]_: Fourteen children were divided
randomly into two groups. Their ranks at performing a specific test are
as follows.
>>> import numpy as np
>>> x = [1, 2, 3, 4, 6, 7, 8]
>>> y = [5, 9, 10, 11, 12, 13, 14]
We use the BWS test to assess whether there is a statistically significant
difference between the two groups.
The null hypothesis is that there is no difference in the distributions of
performance between the two groups. We decide that a significance level of
1% is required to reject the null hypothesis in favor of the alternative
that the distributions are different.
Since the number of samples is very small, we can compare the observed test
statistic against the *exact* distribution of the test statistic under the
null hypothesis.
>>> from scipy.stats import bws_test
>>> res = bws_test(x, y)
>>> print(res.statistic)
5.132167152575315
This agrees with :math:`B = 5.132` reported in [1]_. The *p*-value produced
by `bws_test` also agrees with :math:`p = 0.0029` reported in [1]_.
>>> print(res.pvalue)
0.002913752913752914
Because the p-value is below our threshold of 1%, we take this as evidence
against the null hypothesis in favor of the alternative that there is a
difference in performance between the two groups.
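A one-sided alternative can be assessed in the same way. Here the
observations in `x` have the smaller ranks, so the one-sided statistic is
negative:
>>> res_less = bws_test(x, y, alternative='less')
>>> print(res_less.statistic < 0)
True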
'''
x, y, alternative, method = _bws_input_validation(x, y, alternative,
method)
bws_statistic = partial(_bws_statistic, alternative=alternative)
permutation_alternative = 'less' if alternative == 'less' else 'greater'
res = stats.permutation_test((x, y), bws_statistic,
alternative=permutation_alternative,
**method._asdict())
return res


@ -0,0 +1,459 @@
import numpy as np
def _validate_1d(a, name, allow_inf=False):
if np.ndim(a) != 1:
raise ValueError(f'`{name}` must be a one-dimensional sequence.')
if np.isnan(a).any():
raise ValueError(f'`{name}` must not contain nan.')
if not allow_inf and np.isinf(a).any():
raise ValueError(f'`{name}` must contain only finite values.')
def _validate_interval(interval):
interval = np.asarray(interval)
if interval.shape == (0,):
# The input was a sequence with length 0.
interval = interval.reshape((0, 2))
if interval.ndim != 2 or interval.shape[-1] != 2:
raise ValueError('`interval` must be a two-dimensional array with '
'shape (m, 2), where m is the number of '
'interval-censored values, but got shape '
f'{interval.shape}')
if np.isnan(interval).any():
raise ValueError('`interval` must not contain nan.')
if np.isinf(interval).all(axis=1).any():
raise ValueError('In each row of `interval`, at least one of the two'
' values must be finite.')
if (interval[:, 0] > interval[:, 1]).any():
raise ValueError('In each row of `interval`, the left value must not'
' exceed the right value.')
uncensored_mask = interval[:, 0] == interval[:, 1]
left_mask = np.isinf(interval[:, 0])
right_mask = np.isinf(interval[:, 1])
interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
uncensored2 = interval[uncensored_mask, 0]
left2 = interval[left_mask, 1]
right2 = interval[right_mask, 0]
interval2 = interval[interval_mask]
return uncensored2, left2, right2, interval2
def _validate_x_censored(x, censored):
x = np.asarray(x)
if x.ndim != 1:
raise ValueError('`x` must be one-dimensional.')
censored = np.asarray(censored)
if censored.ndim != 1:
raise ValueError('`censored` must be one-dimensional.')
if (~np.isfinite(x)).any():
raise ValueError('`x` must not contain nan or inf.')
if censored.size != x.size:
raise ValueError('`x` and `censored` must have the same length.')
return x, censored.astype(bool)
class CensoredData:
"""
Instances of this class represent censored data.
Instances may be passed to the ``fit`` method of continuous
univariate SciPy distributions for maximum likelihood estimation.
The *only* method of the univariate continuous distributions that
understands `CensoredData` is the ``fit`` method. An instance of
`CensoredData` can not be passed to methods such as ``pdf`` and
``cdf``.
An observation is said to be *censored* when the precise value is unknown,
but it has a known upper and/or lower bound. The conventional terminology
is:
* left-censored: an observation is below a certain value but it is
unknown by how much.
* right-censored: an observation is above a certain value but it is
unknown by how much.
* interval-censored: an observation lies somewhere on an interval between
two values.
Left-, right-, and interval-censored data can be represented by
`CensoredData`.
For convenience, the class methods ``left_censored`` and
``right_censored`` are provided to create a `CensoredData`
instance from a single one-dimensional array of measurements
and a corresponding boolean array to indicate which measurements
are censored. The class method ``interval_censored`` accepts two
one-dimensional arrays that hold the lower and upper bounds of the
intervals.
Parameters
----------
uncensored : array_like, 1D
Uncensored observations.
left : array_like, 1D
Left-censored observations.
right : array_like, 1D
Right-censored observations.
interval : array_like, 2D, with shape (m, 2)
Interval-censored observations. Each row ``interval[k, :]``
represents the interval for the kth interval-censored observation.
Notes
-----
In the input array `interval`, the lower bound of the interval may
be ``-inf``, and the upper bound may be ``inf``, but at least one must be
finite. When the lower bound is ``-inf``, the row represents a left-
censored observation, and when the upper bound is ``inf``, the row
represents a right-censored observation. If the length of an interval
is 0 (i.e. ``interval[k, 0] == interval[k, 1]``), the observation is
treated as uncensored. So one can represent all the types of censored
and uncensored data in ``interval``, but it is generally more convenient
to use `uncensored`, `left` and `right` for uncensored, left-censored and
right-censored observations, respectively.
Examples
--------
In the most general case, a censored data set may contain values that
are left-censored, right-censored, interval-censored, and uncensored.
For example, here we create a data set with five observations. Two
are uncensored (values 1 and 1.5), one is a left-censored observation
of 0, one is a right-censored observation of 10 and one is
interval-censored in the interval [2, 3].
>>> import numpy as np
>>> from scipy.stats import CensoredData
>>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
... interval=[[2, 3]])
>>> print(data)
CensoredData(5 values: 2 not censored, 1 left-censored,
1 right-censored, 1 interval-censored)
Equivalently,
>>> data = CensoredData(interval=[[1, 1],
... [1.5, 1.5],
... [-np.inf, 0],
... [10, np.inf],
... [2, 3]])
>>> print(data)
CensoredData(5 values: 2 not censored, 1 left-censored,
1 right-censored, 1 interval-censored)
A common case is to have a mix of uncensored observations and censored
observations that are all right-censored (or all left-censored). For
example, consider an experiment in which six devices are started at
various times and left running until they fail. Assume that time is
measured in hours, and the experiment is stopped after 30 hours, even
if all the devices have not failed by that time. We might end up with
data such as this::
Device Start-time Fail-time Time-to-failure
1 0 13 13
2 2 24 22
3 5 22 17
4 8 23 15
5 10 *** >20
6 12 *** >18
Two of the devices had not failed when the experiment was stopped;
the observations of the time-to-failure for these two devices are
right-censored. We can represent this data with
>>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
>>> print(data)
CensoredData(6 values: 4 not censored, 2 right-censored)
Alternatively, we can use the method `CensoredData.right_censored` to
create a representation of this data. The time-to-failure observations
are put in the list ``ttf``. The ``censored`` list indicates which values
in ``ttf`` are censored.
>>> ttf = [13, 22, 17, 15, 20, 18]
>>> censored = [False, False, False, False, True, True]
Pass these lists to `CensoredData.right_censored` to create an
instance of `CensoredData`.
>>> data = CensoredData.right_censored(ttf, censored)
>>> print(data)
CensoredData(6 values: 4 not censored, 2 right-censored)
If the input data is interval censored and already stored in two
arrays, one holding the low end of the intervals and another
holding the high ends, the class method ``interval_censored`` can
be used to create the `CensoredData` instance.
This example creates an instance with four interval-censored values.
The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
>>> a = [10, 0.5, 2, 12.5] # Low ends of the intervals
>>> b = [11, 1.0, 3, 13.5] # High ends of the intervals
>>> data = CensoredData.interval_censored(low=a, high=b)
>>> print(data)
CensoredData(4 values: 0 not censored, 4 interval-censored)
Finally, we create and censor some data from the `weibull_min`
distribution, and then fit `weibull_min` to that data. We'll assume
that the location parameter is known to be 0.
>>> from scipy.stats import weibull_min
>>> rng = np.random.default_rng()
Create the random data set.
>>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
>>> x[x > 40] = 40 # Right-censor values greater or equal to 40.
Create the `CensoredData` instance with the `right_censored` method.
The censored values are those where the value is 40.
>>> data = CensoredData.right_censored(x, x == 40)
>>> print(data)
CensoredData(250 values: 215 not censored, 35 right-censored)
35 values have been right-censored.
Fit `weibull_min` to the censored data. We expect the shape and scale
to be approximately 2.5 and 30, respectively.
>>> weibull_min.fit(data, floc=0)
(2.3575922823897315, 0, 30.40650074451254)
"""
def __init__(self, uncensored=None, *, left=None, right=None,
interval=None):
if uncensored is None:
uncensored = []
if left is None:
left = []
if right is None:
right = []
if interval is None:
interval = np.empty((0, 2))
_validate_1d(uncensored, 'uncensored')
_validate_1d(left, 'left')
_validate_1d(right, 'right')
uncensored2, left2, right2, interval2 = _validate_interval(interval)
self._uncensored = np.concatenate((uncensored, uncensored2))
self._left = np.concatenate((left, left2))
self._right = np.concatenate((right, right2))
# Note that by construction, the private attribute _interval
# will be a 2D array that contains only finite values representing
# intervals with nonzero but finite length.
self._interval = interval2
def __repr__(self):
uncensored_str = " ".join(np.array_repr(self._uncensored).split())
left_str = " ".join(np.array_repr(self._left).split())
right_str = " ".join(np.array_repr(self._right).split())
interval_str = " ".join(np.array_repr(self._interval).split())
return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
f"right={right_str}, interval={interval_str})")
def __str__(self):
num_nc = len(self._uncensored)
num_lc = len(self._left)
num_rc = len(self._right)
num_ic = len(self._interval)
n = num_nc + num_lc + num_rc + num_ic
parts = [f'{num_nc} not censored']
if num_lc > 0:
parts.append(f'{num_lc} left-censored')
if num_rc > 0:
parts.append(f'{num_rc} right-censored')
if num_ic > 0:
parts.append(f'{num_ic} interval-censored')
return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
# This is not a complete implementation of the arithmetic operators.
# All we need is subtracting a scalar and dividing by a scalar.
def __sub__(self, other):
return CensoredData(uncensored=self._uncensored - other,
left=self._left - other,
right=self._right - other,
interval=self._interval - other)
def __truediv__(self, other):
return CensoredData(uncensored=self._uncensored / other,
left=self._left / other,
right=self._right / other,
interval=self._interval / other)
def __len__(self):
"""
The number of values (censored and not censored).
"""
return (len(self._uncensored) + len(self._left) + len(self._right)
+ len(self._interval))
def num_censored(self):
"""
Number of censored values.
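Examples
--------
>>> from scipy.stats import CensoredData
>>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10])
>>> data.num_censored()
2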
"""
return len(self._left) + len(self._right) + len(self._interval)
@classmethod
def right_censored(cls, x, censored):
"""
Create a `CensoredData` instance of right-censored data.
Parameters
----------
x : array_like
`x` is the array of observed data or measurements.
`x` must be a one-dimensional sequence of finite numbers.
censored : array_like of bool
`censored` must be a one-dimensional sequence of boolean
values. If ``censored[k]`` is True, the corresponding value
in `x` is right-censored. That is, the value ``x[k]``
is the lower bound of the true (but unknown) value.
Returns
-------
data : `CensoredData`
An instance of `CensoredData` that represents the
collection of uncensored and right-censored values.
Examples
--------
>>> from scipy.stats import CensoredData
Two uncensored values (4 and 10) and two right-censored values
(24 and 25).
>>> data = CensoredData.right_censored([4, 10, 24, 25],
... [False, False, True, True])
>>> data
CensoredData(uncensored=array([ 4., 10.]),
left=array([], dtype=float64), right=array([24., 25.]),
interval=array([], shape=(0, 2), dtype=float64))
>>> print(data)
CensoredData(4 values: 2 not censored, 2 right-censored)
"""
x, censored = _validate_x_censored(x, censored)
return cls(uncensored=x[~censored], right=x[censored])
@classmethod
def left_censored(cls, x, censored):
"""
Create a `CensoredData` instance of left-censored data.
Parameters
----------
x : array_like
`x` is the array of observed data or measurements.
`x` must be a one-dimensional sequence of finite numbers.
censored : array_like of bool
`censored` must be a one-dimensional sequence of boolean
values. If ``censored[k]`` is True, the corresponding value
in `x` is left-censored. That is, the value ``x[k]``
is the upper bound of the true (but unknown) value.
Returns
-------
data : `CensoredData`
An instance of `CensoredData` that represents the
collection of uncensored and left-censored values.
Examples
--------
>>> from scipy.stats import CensoredData
Two uncensored values (0.12 and 0.033) and two left-censored values
(both 1e-3).
>>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
... [False, False, True, True])
>>> data
CensoredData(uncensored=array([0.12 , 0.033]),
left=array([0.001, 0.001]), right=array([], dtype=float64),
interval=array([], shape=(0, 2), dtype=float64))
>>> print(data)
CensoredData(4 values: 2 not censored, 2 left-censored)
"""
x, censored = _validate_x_censored(x, censored)
return cls(uncensored=x[~censored], left=x[censored])
@classmethod
def interval_censored(cls, low, high):
"""
Create a `CensoredData` instance of interval-censored data.
This method is useful when all the data is interval-censored, and
the low and high ends of the intervals are already stored in
separate one-dimensional arrays.
Parameters
----------
low : array_like
The one-dimensional array containing the low ends of the
intervals.
high : array_like
The one-dimensional array containing the high ends of the
intervals.
Returns
-------
data : `CensoredData`
An instance of `CensoredData` that represents the
collection of censored values.
Examples
--------
>>> import numpy as np
>>> from scipy.stats import CensoredData
``a`` and ``b`` are the low and high ends of a collection of
interval-censored values.
>>> a = [0.5, 2.0, 3.0, 5.5]
>>> b = [1.0, 2.5, 3.5, 7.0]
>>> data = CensoredData.interval_censored(low=a, high=b)
>>> print(data)
CensoredData(4 values: 0 not censored, 4 interval-censored)
"""
_validate_1d(low, 'low', allow_inf=True)
_validate_1d(high, 'high', allow_inf=True)
if len(low) != len(high):
raise ValueError('`low` and `high` must have the same length.')
interval = np.column_stack((low, high))
uncensored, left, right, interval = _validate_interval(interval)
return cls(uncensored=uncensored, left=left, right=right,
interval=interval)
def _uncensor(self):
"""
This function is used when a non-censored version of the data
is needed to create a rough estimate of the parameters of a
distribution via the method of moments or some similar method.
The data is "uncensored" by taking the given endpoints as the
data for the left- or right-censored data, and the mean for the
interval-censored data.
"""
data = np.concatenate((self._uncensored, self._left, self._right,
self._interval.mean(axis=1)))
return data
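# Illustrative example: for
# CensoredData(uncensored=[1], left=[0], right=[10], interval=[[2, 4]]),
# _uncensor() gives the values [1., 0., 10., 3.], i.e. the censoring
# bounds themselves plus the interval midpoint.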
def _supported(self, a, b):
"""
Return a subset of self containing the values that are in
(or overlap with) the interval (a, b).
"""
uncensored = self._uncensored
uncensored = uncensored[(a < uncensored) & (uncensored < b)]
left = self._left
left = left[a < left]
right = self._right
right = right[right < b]
interval = self._interval
interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
return CensoredData(uncensored, left=left, right=right,
interval=interval)


@ -0,0 +1,5 @@
from collections import namedtuple
ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
ConfidenceInterval.__doc__ = "Class for confidence intervals."


@ -0,0 +1,39 @@
"""
Statistics-related constants.
"""
import numpy as np
# The smallest representable positive number such that 1.0 + _EPS != 1.0.
_EPS = np.finfo(float).eps
# The largest [in magnitude] usable floating value.
_XMAX = np.finfo(float).max
# The log of the largest usable floating value; useful for knowing
# when exp(something) will overflow
_LOGXMAX = np.log(_XMAX)
# The smallest [in magnitude] usable (i.e. not subnormal) double precision
# floating value.
_XMIN = np.finfo(float).tiny
# The log of the smallest [in magnitude] usable (i.e not subnormal)
# double precision floating value.
_LOGXMIN = np.log(_XMIN)
# -special.psi(1)
_EULER = 0.577215664901532860606512090082402431042
# special.zeta(3, 1) Apery's constant
_ZETA3 = 1.202056903159594285399738161511449990765
# sqrt(pi)
_SQRT_PI = 1.772453850905516027298167483341145182798
# sqrt(2/pi)
_SQRT_2_OVER_PI = 0.7978845608028654
# log(sqrt(2/pi))
_LOG_SQRT_2_OVER_PI = -0.22579135264472744

File diff suppressed because it is too large


@ -0,0 +1,633 @@
from functools import cached_property
import numpy as np
from scipy import linalg
from scipy.stats import _multivariate
__all__ = ["Covariance"]
class Covariance:
"""
Representation of a covariance matrix
Calculations involving covariance matrices (e.g. data whitening,
multivariate normal function evaluation) are often performed more
efficiently using a decomposition of the covariance matrix instead of the
covariance matrix itself. This class allows the user to construct an
object representing a covariance matrix using any of several
decompositions and perform calculations using a common interface.
.. note::
The `Covariance` class cannot be instantiated directly. Instead, use
one of the factory methods (e.g. `Covariance.from_diagonal`).
Examples
--------
The `Covariance` class is used by calling one of its
factory methods to create a `Covariance` object, and then passing that
representation of the covariance matrix as a shape parameter of a
multivariate distribution.
For instance, the multivariate normal distribution can accept an array
representing a covariance matrix:
>>> from scipy import stats
>>> import numpy as np
>>> d = [1, 2, 3]
>>> A = np.diag(d) # a diagonal covariance matrix
>>> x = [4, -2, 5] # a point of interest
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
>>> dist.pdf(x)
4.9595685102808205e-08
but the calculations are performed in a very generic way that does not
take advantage of any special properties of the covariance matrix. Because
our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
to create an object representing the covariance matrix, and
`multivariate_normal` can use this to compute the probability density
function more efficiently.
>>> cov = stats.Covariance.from_diagonal(d)
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
>>> dist.pdf(x)
4.9595685102808205e-08
"""
def __init__(self):
message = ("The `Covariance` class cannot be instantiated directly. "
"Please use one of the factory methods "
"(e.g. `Covariance.from_diagonal`).")
raise NotImplementedError(message)
@staticmethod
def from_diagonal(diagonal):
r"""
Return a representation of a covariance matrix from its diagonal.
Parameters
----------
diagonal : array_like
The diagonal elements of a diagonal matrix.
Notes
-----
Let the diagonal elements of a diagonal covariance matrix :math:`D` be
stored in the vector :math:`d`.
When all elements of :math:`d` are strictly positive, whitening of a
data point :math:`x` is performed by computing
:math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
element-wise.
:math:`\log\det{D}` is calculated as :math:`\sum(\log{d})`,
where the :math:`\log` operation is performed element-wise.
This `Covariance` class supports singular covariance matrices. When
computing ``_log_pdet``, non-positive elements of :math:`d` are
ignored. Whitening is not well defined when the point to be whitened
does not lie in the span of the columns of the covariance matrix. The
convention taken here is to treat the inverse square root of
non-positive elements of :math:`d` as zeros.
Examples
--------
Prepare a symmetric positive definite covariance matrix ``A`` and a
data point ``x``.
>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng()
>>> n = 5
>>> A = np.diag(rng.random(n))
>>> x = rng.random(size=n)
Extract the diagonal from ``A`` and create the `Covariance` object.
>>> d = np.diag(A)
>>> cov = stats.Covariance.from_diagonal(d)
Compare the functionality of the `Covariance` object against
reference implementations.
>>> res = cov.whiten(x)
>>> ref = np.diag(d**-0.5) @ x
>>> np.allclose(res, ref)
True
>>> res = cov.log_pdet
>>> ref = np.linalg.slogdet(A)[-1]
>>> np.allclose(res, ref)
True
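The representation also supports a rank-deficient diagonal covariance; as
described in the Notes above, non-positive entries of the diagonal are
ignored when computing the log of the pseudo-determinant:
>>> cov = stats.Covariance.from_diagonal([1., 0., 4.])
>>> np.allclose(cov.log_pdet, np.log(4.))
True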
"""
return CovViaDiagonal(diagonal)
@staticmethod
def from_precision(precision, covariance=None):
r"""
Return a representation of a covariance from its precision matrix.
Parameters
----------
precision : array_like
The precision matrix; that is, the inverse of a square, symmetric,
positive definite covariance matrix.
covariance : array_like, optional
The square, symmetric, positive definite covariance matrix. If not
provided, this may need to be calculated (e.g. to evaluate the
cumulative distribution function of
`scipy.stats.multivariate_normal`) by inverting `precision`.
Notes
-----
Let the covariance matrix be :math:`A`, its precision matrix be
:math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor such
that :math:`L L^T = P`.
Whitening of a data point :math:`x` is performed by computing
:math:`x^T L`. :math:`\log\det{A}` is calculated as
:math:`-2tr(\log{L})`, where the :math:`\log` operation is performed
element-wise.
This `Covariance` class does not support singular covariance matrices
because the precision matrix does not exist for a singular covariance
matrix.
Examples
--------
Prepare a symmetric positive definite precision matrix ``P`` and a
data point ``x``. (If the precision matrix is not already available,
consider the other factory methods of the `Covariance` class.)
>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng()
>>> n = 5
>>> P = rng.random(size=(n, n))
>>> P = P @ P.T # a precision matrix must be positive definite
>>> x = rng.random(size=n)
Create the `Covariance` object.
>>> cov = stats.Covariance.from_precision(P)
Compare the functionality of the `Covariance` object against
reference implementations.
>>> res = cov.whiten(x)
>>> ref = x @ np.linalg.cholesky(P)
>>> np.allclose(res, ref)
True
>>> res = cov.log_pdet
>>> ref = -np.linalg.slogdet(P)[-1]
>>> np.allclose(res, ref)
True
"""
return CovViaPrecision(precision, covariance)
@staticmethod
def from_cholesky(cholesky):
r"""
Representation of a covariance provided via the (lower) Cholesky factor
Parameters
----------
cholesky : array_like
The lower triangular Cholesky factor of the covariance matrix.
Notes
-----
Let the covariance matrix be :math:`A` and :math:`L` be the lower
Cholesky factor such that :math:`L L^T = A`.
Whitening of a data point :math:`x` is performed by computing
:math:`L^{-1} x`. :math:`\log\det{A}` is calculated as
:math:`2tr(\log{L})`, where the :math:`\log` operation is performed
element-wise.
This `Covariance` class does not support singular covariance matrices
because the Cholesky decomposition does not exist for a singular
covariance matrix.
Examples
--------
Prepare a symmetric positive definite covariance matrix ``A`` and a
data point ``x``.
>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng()
>>> n = 5
>>> A = rng.random(size=(n, n))
>>> A = A @ A.T # make the covariance symmetric positive definite
>>> x = rng.random(size=n)
Perform the Cholesky decomposition of ``A`` and create the
`Covariance` object.
>>> L = np.linalg.cholesky(A)
>>> cov = stats.Covariance.from_cholesky(L)
Compare the functionality of the `Covariance` object against
reference implementation.
>>> from scipy.linalg import solve_triangular
>>> res = cov.whiten(x)
>>> ref = solve_triangular(L, x, lower=True)
>>> np.allclose(res, ref)
True
>>> res = cov.log_pdet
>>> ref = np.linalg.slogdet(A)[-1]
>>> np.allclose(res, ref)
True
"""
return CovViaCholesky(cholesky)
@staticmethod
def from_eigendecomposition(eigendecomposition):
r"""
Representation of a covariance provided via eigendecomposition
Parameters
----------
eigendecomposition : sequence
A sequence (nominally a tuple) containing the eigenvalue and
eigenvector arrays as computed by `scipy.linalg.eigh` or
`numpy.linalg.eigh`.
Notes
-----
Let the covariance matrix be :math:`A`, let :math:`V` be the matrix of
eigenvectors, and let :math:`W` be the diagonal matrix of eigenvalues
such that :math:`V W V^T = A`.
When all of the eigenvalues are strictly positive, whitening of a
data point :math:`x` is performed by computing
:math:`x^T (V W^{-1/2})`, where the inverse square root can be taken
element-wise.
:math:`\log\det{A}` is calculated as :math:`tr(\log{W})`,
where the :math:`\log` operation is performed element-wise.
This `Covariance` class supports singular covariance matrices. When
computing ``_log_pdet``, non-positive eigenvalues are ignored.
Whitening is not well defined when the point to be whitened
does not lie in the span of the columns of the covariance matrix. The
convention taken here is to treat the inverse square root of
non-positive eigenvalues as zeros.
Examples
--------
Prepare a symmetric positive definite covariance matrix ``A`` and a
data point ``x``.
>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng()
>>> n = 5
>>> A = rng.random(size=(n, n))
>>> A = A @ A.T # make the covariance symmetric positive definite
>>> x = rng.random(size=n)
Perform the eigendecomposition of ``A`` and create the `Covariance`
object.
>>> w, v = np.linalg.eigh(A)
>>> cov = stats.Covariance.from_eigendecomposition((w, v))
Compare the functionality of the `Covariance` object against
reference implementations.
>>> res = cov.whiten(x)
>>> ref = x @ (v @ np.diag(w**-0.5))
>>> np.allclose(res, ref)
True
>>> res = cov.log_pdet
>>> ref = np.linalg.slogdet(A)[-1]
>>> np.allclose(res, ref)
True
"""
return CovViaEigendecomposition(eigendecomposition)
def whiten(self, x):
"""
Perform a whitening transformation on data.
"Whitening" ("white" as in "white noise", in which each frequency has
equal magnitude) transforms a set of random variables into a new set of
random variables with unit-diagonal covariance. When a whitening
transform is applied to a sample of points distributed according to
a multivariate normal distribution with zero mean, the covariance of
the transformed sample is approximately the identity matrix.
Parameters
----------
x : array_like
An array of points. The last dimension must correspond with the
dimensionality of the space, i.e., the number of columns in the
covariance matrix.
Returns
-------
x_ : array_like
The transformed array of points.
References
----------
.. [1] "Whitening Transformation". Wikipedia.
https://en.wikipedia.org/wiki/Whitening_transformation
.. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
coloring linear transformation". Transactions of VSB 18.2
(2018): 31-35. :doi:`10.31490/tces-2018-0013`
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng()
>>> n = 3
>>> A = rng.random(size=(n, n))
>>> cov_array = A @ A.T # make matrix symmetric positive definite
>>> precision = np.linalg.inv(cov_array)
>>> cov_object = stats.Covariance.from_precision(precision)
>>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
>>> x_ = cov_object.whiten(x)
>>> np.cov(x_, rowvar=False) # near-identity covariance
array([[0.97862122, 0.00893147, 0.02430451],
[0.00893147, 0.96719062, 0.02201312],
[0.02430451, 0.02201312, 0.99206881]])
"""
return self._whiten(np.asarray(x))
def colorize(self, x):
"""
Perform a colorizing transformation on data.
"Colorizing" ("color" as in "colored noise", in which different
frequencies may have different magnitudes) transforms a set of
uncorrelated random variables into a new set of random variables with
the desired covariance. When a coloring transform is applied to a
sample of points distributed according to a multivariate normal
distribution with identity covariance and zero mean, the covariance of
the transformed sample is approximately the covariance matrix used
in the coloring transform.
Parameters
----------
x : array_like
An array of points. The last dimension must correspond with the
dimensionality of the space, i.e., the number of columns in the
covariance matrix.
Returns
-------
x_ : array_like
The transformed array of points.
References
----------
.. [1] "Whitening Transformation". Wikipedia.
https://en.wikipedia.org/wiki/Whitening_transformation
.. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
coloring linear transformation". Transactions of VSB 18.2
(2018): 31-35. :doi:`10.31490/tces-2018-0013`
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> rng = np.random.default_rng(1638083107694713882823079058616272161)
>>> n = 3
>>> A = rng.random(size=(n, n))
>>> cov_array = A @ A.T # make matrix symmetric positive definite
>>> cholesky = np.linalg.cholesky(cov_array)
>>> cov_object = stats.Covariance.from_cholesky(cholesky)
>>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
>>> x_ = cov_object.colorize(x)
>>> cov_data = np.cov(x_, rowvar=False)
>>> np.allclose(cov_data, cov_array, rtol=3e-2)
True
"""
return self._colorize(np.asarray(x))
@property
def log_pdet(self):
"""
Log of the pseudo-determinant of the covariance matrix
"""
return np.array(self._log_pdet, dtype=float)[()]
@property
def rank(self):
"""
Rank of the covariance matrix
"""
return np.array(self._rank, dtype=int)[()]
@property
def covariance(self):
"""
Explicit representation of the covariance matrix
"""
return self._covariance
@property
def shape(self):
"""
Shape of the covariance array
"""
return self._shape
def _validate_matrix(self, A, name):
A = np.atleast_2d(A)
m, n = A.shape[-2:]
if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
np.issubdtype(A.dtype, np.floating)):
message = (f"The input `{name}` must be a square, "
"two-dimensional array of real numbers.")
raise ValueError(message)
return A
def _validate_vector(self, A, name):
A = np.atleast_1d(A)
if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
np.issubdtype(A.dtype, np.floating)):
message = (f"The input `{name}` must be a one-dimensional array "
"of real numbers.")
raise ValueError(message)
return A
class CovViaPrecision(Covariance):
def __init__(self, precision, covariance=None):
precision = self._validate_matrix(precision, 'precision')
if covariance is not None:
covariance = self._validate_matrix(covariance, 'covariance')
message = "`precision.shape` must equal `covariance.shape`."
if precision.shape != covariance.shape:
raise ValueError(message)
self._chol_P = np.linalg.cholesky(precision)
self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
self._rank = precision.shape[-1] # must be full rank if invertible
self._precision = precision
self._cov_matrix = covariance
self._shape = precision.shape
self._allow_singular = False
def _whiten(self, x):
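# Explanatory note: with precision P = L @ L.T (L = self._chol_P) the
# covariance is inv(P), and rows x with covariance inv(P) give x @ L with
# covariance L.T @ inv(P) @ L = I, i.e. the whitened points.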
return x @ self._chol_P
@cached_property
def _covariance(self):
n = self._shape[-1]
return (linalg.cho_solve((self._chol_P, True), np.eye(n))
if self._cov_matrix is None else self._cov_matrix)
def _colorize(self, x):
return linalg.solve_triangular(self._chol_P.T, x.T, lower=False).T
def _dot_diag(x, d):
# If d were a full diagonal matrix, x @ d would always do what we want.
# Special treatment is needed for n-dimensional `d` in which each row
# includes only the diagonal elements of a covariance matrix.
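# For instance, x of shape (m, n) (m points) times a batched d of shape
# (b, n) broadcasts via np.expand_dims(d, -2) -> (b, 1, n) to a result of
# shape (b, m, n): each of the b diagonals scales every point.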
return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
class CovViaDiagonal(Covariance):
def __init__(self, diagonal):
diagonal = self._validate_vector(diagonal, 'diagonal')
i_zero = diagonal <= 0
positive_diagonal = np.array(diagonal, dtype=np.float64)
positive_diagonal[i_zero] = 1 # ones don't affect determinant
self._log_pdet = np.sum(np.log(positive_diagonal), axis=-1)
pseudo_reciprocals = 1 / np.sqrt(positive_diagonal)
pseudo_reciprocals[i_zero] = 0
self._sqrt_diagonal = np.sqrt(diagonal)
self._LP = pseudo_reciprocals
self._rank = positive_diagonal.shape[-1] - i_zero.sum(axis=-1)
self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
self._i_zero = i_zero
self._shape = self._covariance.shape
self._allow_singular = True
def _whiten(self, x):
return _dot_diag(x, self._LP)
def _colorize(self, x):
return _dot_diag(x, self._sqrt_diagonal)
def _support_mask(self, x):
"""
Check whether x lies in the support of the distribution.
"""
return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
class CovViaCholesky(Covariance):
def __init__(self, cholesky):
L = self._validate_matrix(cholesky, 'cholesky')
self._factor = L
self._log_pdet = 2*np.log(np.diag(self._factor)).sum(axis=-1)
self._rank = L.shape[-1] # must be full rank for cholesky
self._shape = L.shape
self._allow_singular = False
@cached_property
def _covariance(self):
return self._factor @ self._factor.T
def _whiten(self, x):
res = linalg.solve_triangular(self._factor, x.T, lower=True).T
return res
def _colorize(self, x):
return x @ self._factor.T
class CovViaEigendecomposition(Covariance):
def __init__(self, eigendecomposition):
eigenvalues, eigenvectors = eigendecomposition
eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
message = ("The shapes of `eigenvalues` and `eigenvectors` "
"must be compatible.")
try:
eigenvalues = np.expand_dims(eigenvalues, -2)
eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
eigenvalues)
eigenvalues = eigenvalues[..., 0, :]
except ValueError:
raise ValueError(message)
i_zero = eigenvalues <= 0
positive_eigenvalues = np.array(eigenvalues, dtype=np.float64)
positive_eigenvalues[i_zero] = 1 # ones don't affect determinant
self._log_pdet = np.sum(np.log(positive_eigenvalues), axis=-1)
pseudo_reciprocals = 1 / np.sqrt(positive_eigenvalues)
pseudo_reciprocals[i_zero] = 0
self._LP = eigenvectors * pseudo_reciprocals
self._LA = eigenvectors * np.sqrt(eigenvalues)
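# With Sigma = V @ diag(w) @ V.T (a valid positive-semidefinite
# decomposition): `_LP` = V * w**-0.5 is the whitening factor, so x @ _LP
# has identity covariance on the positive-eigenvalue subspace, while
# `_LA` = V * w**0.5 is the coloring factor, since _LA @ _LA.T = Sigma.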
self._rank = positive_eigenvalues.shape[-1] - i_zero.sum(axis=-1)
self._w = eigenvalues
self._v = eigenvectors
self._shape = eigenvectors.shape
self._null_basis = eigenvectors * i_zero
# This is only used for `_support_mask`, not to decide whether
# the covariance is singular or not.
self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
self._allow_singular = True
def _whiten(self, x):
return x @ self._LP
def _colorize(self, x):
return x @ self._LA.T
@cached_property
def _covariance(self):
return (self._v * self._w) @ self._v.T
def _support_mask(self, x):
"""
Check whether x lies in the support of the distribution.
"""
residual = np.linalg.norm(x @ self._null_basis, axis=-1)
in_support = residual < self._eps
return in_support
class CovViaPSD(Covariance):
"""
Representation of a covariance provided via an instance of _PSD
"""
def __init__(self, psd):
self._LP = psd.U
self._log_pdet = psd.log_pdet
self._rank = psd.rank
self._covariance = psd._M
self._shape = psd._M.shape
self._psd = psd
self._allow_singular = False # by default
def _whiten(self, x):
return x @ self._LP
def _support_mask(self, x):
return self._psd._support_mask(x)

View File

@ -0,0 +1,204 @@
import numpy as np
from scipy.sparse import coo_matrix
from scipy._lib._bunch import _make_tuple_bunch
CrosstabResult = _make_tuple_bunch(
"CrosstabResult", ["elements", "count"]
)
def crosstab(*args, levels=None, sparse=False):
"""
Return table of counts for each possible unique combination in ``*args``.
When ``len(args) > 1``, the array computed by this function is
often referred to as a *contingency table* [1]_.
The arguments must be sequences with the same length. The second return
value, `count`, is an integer array with ``len(args)`` dimensions. If
`levels` is None, the shape of `count` is ``(n0, n1, ...)``, where ``nk``
is the number of unique elements in ``args[k]``.
Parameters
----------
*args : sequences
A sequence of sequences whose unique aligned elements are to be
counted. The sequences in args must all be the same length.
levels : sequence, optional
If `levels` is given, it must be a sequence that is the same length as
`args`. Each element in `levels` is either a sequence or None. If it
is a sequence, it gives the values in the corresponding sequence in
`args` that are to be counted. If any value in the sequences in `args`
does not occur in the corresponding sequence in `levels`, that value
is ignored and not counted in the returned array `count`. The default
value of `levels` for ``args[i]`` is ``np.unique(args[i])``.
sparse : bool, optional
If True, return a sparse matrix. The matrix will be an instance of
the `scipy.sparse.coo_matrix` class. Because SciPy's sparse matrices
must be 2-d, only two input sequences are allowed when `sparse` is
True. Default is False.
Returns
-------
res : CrosstabResult
An object containing the following attributes:
elements : tuple of numpy.ndarrays.
Tuple of length ``len(args)`` containing the arrays of elements
that are counted in `count`. These can be interpreted as the
labels of the corresponding dimensions of `count`. If `levels` was
given, then if ``levels[i]`` is not None, ``elements[i]`` will
hold the values given in ``levels[i]``.
count : numpy.ndarray or scipy.sparse.coo_matrix
Counts of the unique elements in ``zip(*args)``, stored in an
array. Also known as a *contingency table* when ``len(args) > 1``.
See Also
--------
numpy.unique
Notes
-----
.. versionadded:: 1.7.0
References
----------
.. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table
Examples
--------
>>> from scipy.stats.contingency import crosstab
Given the lists `a` and `x`, create a contingency table that counts the
frequencies of the corresponding pairs.
>>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
>>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
>>> res = crosstab(a, x)
>>> avals, xvals = res.elements
>>> avals
array(['A', 'B'], dtype='<U1')
>>> xvals
array(['X', 'Y', 'Z'], dtype='<U1')
>>> res.count
array([[2, 3, 0],
[1, 0, 4]])
So `('A', 'X')` occurs twice, `('A', 'Y')` occurs three times, etc.
Higher dimensional contingency tables can be created.
>>> p = [0, 0, 0, 0, 1, 1, 1, 0, 0, 1]
>>> res = crosstab(a, x, p)
>>> res.count
array([[[2, 0],
[2, 1],
[0, 0]],
[[1, 0],
[0, 0],
[1, 3]]])
>>> res.count.shape
(2, 3, 2)
The values to be counted can be set by using the `levels` argument.
It allows the elements of interest in each input sequence to be
given explicitly instead of finding the unique elements of the sequence.
For example, suppose one of the arguments is an array containing the
answers to a survey question, with integer values 1 to 4. Even if the
value 1 does not occur in the data, we want an entry for it in the table.
>>> q1 = [2, 3, 3, 2, 4, 4, 2, 3, 4, 4, 4, 3, 3, 3, 4] # 1 does not occur.
>>> q2 = [4, 4, 2, 2, 2, 4, 1, 1, 2, 2, 4, 2, 2, 2, 4] # 3 does not occur.
>>> options = [1, 2, 3, 4]
>>> res = crosstab(q1, q2, levels=(options, options))
>>> res.count
array([[0, 0, 0, 0],
[1, 1, 0, 1],
[1, 4, 0, 1],
[0, 3, 0, 3]])
If `levels` is given, but an element of `levels` is None, the unique values
of the corresponding argument are used. For example,
>>> res = crosstab(q1, q2, levels=(None, options))
>>> res.elements
[array([2, 3, 4]), [1, 2, 3, 4]]
>>> res.count
array([[1, 1, 0, 1],
[1, 4, 0, 1],
[0, 3, 0, 3]])
If we want to ignore the pairs where 4 occurs in ``q2``, we can
give just the values [1, 2] to `levels`, and the 4 will be ignored:
>>> res = crosstab(q1, q2, levels=(None, [1, 2]))
>>> res.elements
[array([2, 3, 4]), [1, 2]]
>>> res.count
array([[1, 1],
[1, 4],
[0, 3]])
Finally, let's repeat the first example, but return a sparse matrix:
>>> res = crosstab(a, x, sparse=True)
>>> res.count
<COOrdinate sparse matrix of dtype 'int64'
with 4 stored elements and shape (2, 3)>
>>> res.count.toarray()
array([[2, 3, 0],
[1, 0, 4]])
"""
nargs = len(args)
if nargs == 0:
raise TypeError("At least one input sequence is required.")
len0 = len(args[0])
if not all(len(a) == len0 for a in args[1:]):
raise ValueError("All input sequences must have the same length.")
if sparse and nargs != 2:
raise ValueError("When `sparse` is True, only two input sequences "
"are allowed.")
if levels is None:
# Call np.unique with return_inverse=True on each argument.
actual_levels, indices = zip(*[np.unique(a, return_inverse=True)
for a in args])
else:
# `levels` is not None...
if len(levels) != nargs:
raise ValueError('len(levels) must equal the number of input '
'sequences')
args = [np.asarray(arg) for arg in args]
mask = np.zeros((nargs, len0), dtype=np.bool_)
inv = np.zeros((nargs, len0), dtype=np.intp)
actual_levels = []
for k, (levels_list, arg) in enumerate(zip(levels, args)):
if levels_list is None:
levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
mask[k, :] = True
else:
q = arg == np.asarray(levels_list).reshape(-1, 1)
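# q[i, j] is True where arg[j] == levels_list[i]; any hit along axis 0
# marks arg[j] as a counted value (mask), and the nonzero pattern of q.T
# records, for each matched position, the index of its level (inv).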
mask[k, :] = np.any(q, axis=0)
qnz = q.T.nonzero()
inv[k, qnz[0]] = qnz[1]
actual_levels.append(levels_list)
mask_all = mask.all(axis=0)
indices = tuple(inv[:, mask_all])
if sparse:
count = coo_matrix((np.ones(len(indices[0]), dtype=int),
(indices[0], indices[1])))
count.sum_duplicates()
else:
shape = [len(u) for u in actual_levels]
count = np.zeros(shape, dtype=int)
np.add.at(count, indices, 1)
return CrosstabResult(actual_levels, count)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,292 @@
"""
Sane parameters for stats.distributions.
"""
import numpy as np
distcont = [
['alpha', (3.5704770516650459,)],
['anglit', ()],
['arcsine', ()],
['argus', (1.0,)],
['beta', (2.3098496451481823, 0.62687954300963677)],
['betaprime', (5, 6)],
['bradford', (0.29891359763170633,)],
['burr', (10.5, 4.3)],
['burr12', (10, 4)],
['cauchy', ()],
['chi', (78,)],
['chi2', (55,)],
['cosine', ()],
['crystalball', (2.0, 3.0)],
['dgamma', (1.1023326088288166,)],
['dweibull', (2.0685080649914673,)],
['erlang', (10,)],
['expon', ()],
['exponnorm', (1.5,)],
['exponpow', (2.697119160358469,)],
['exponweib', (2.8923945291034436, 1.9505288745913174)],
['f', (29, 18)],
['fatiguelife', (29,)], # correction numargs = 1
['fisk', (3.0857548622253179,)],
['foldcauchy', (4.7164673455831894,)],
['foldnorm', (1.9521253373555869,)],
['gamma', (1.9932305483800778,)],
['gausshyper', (13.763771604130699, 3.1189636648681431,
2.5145980350183019, 5.1811649903971615)], # veryslow
['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
['genextreme', (-0.1,)],
['gengamma', (4.4162385429431925, 3.1193091679242761)],
['gengamma', (4.4162385429431925, -3.1193091679242761)],
['genhalflogistic', (0.77274727809929322,)],
['genhyperbolic', (0.5, 1.5, -0.5,)],
['geninvgauss', (2.3, 1.5)],
['genlogistic', (0.41192440799679475,)],
['gennorm', (1.2988442399460265,)],
['halfgennorm', (0.6748054997000371,)],
['genpareto', (0.1,)], # use case with finite moments
['gibrat', ()],
['gompertz', (0.94743713075105251,)],
['gumbel_l', ()],
['gumbel_r', ()],
['halfcauchy', ()],
['halflogistic', ()],
['halfnorm', ()],
['hypsecant', ()],
['invgamma', (4.0668996136993067,)],
['invgauss', (0.14546264555347513,)],
['invweibull', (10.58,)],
['irwinhall', (10,)],
['jf_skew_t', (8, 4)],
['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
['johnsonsu', (2.554395574161155, 2.2482281679651965)],
['kappa4', (0.0, 0.0)],
['kappa4', (-0.1, 0.1)],
['kappa4', (0.0, 0.1)],
['kappa4', (0.1, 0.0)],
['kappa3', (1.0,)],
['ksone', (1000,)], # replace 22 by 100 to avoid failing range, ticket 956
['kstwo', (10,)],
['kstwobign', ()],
['laplace', ()],
['laplace_asymmetric', (2,)],
['levy', ()],
['levy_l', ()],
['levy_stable', (1.8, -0.5)],
['loggamma', (0.41411931826052117,)],
['logistic', ()],
['loglaplace', (3.2505926592051435,)],
['lognorm', (0.95368226960575331,)],
['loguniform', (0.01, 1.25)],
['lomax', (1.8771398388773268,)],
['maxwell', ()],
['mielke', (10.4, 4.6)],
['moyal', ()],
['nakagami', (4.9673794866666237,)],
['ncf', (27, 27, 0.41578441799226107)],
['nct', (14, 0.24045031331198066)],
['ncx2', (21, 1.0560465975116415)],
['norm', ()],
['norminvgauss', (1.25, 0.5)],
['pareto', (2.621716532144454,)],
['pearson3', (0.1,)],
['pearson3', (-2,)],
['powerlaw', (1.6591133289905851,)],
['powerlaw', (0.6591133289905851,)],
['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
['powernorm', (4.4453652254590779,)],
['rayleigh', ()],
['rdist', (1.6,)],
['recipinvgauss', (0.63004267809369119,)],
['reciprocal', (0.01, 1.25)],
['rel_breitwigner', (36.545206797050334, )],
['rice', (0.7749725210111873,)],
['semicircular', ()],
['skewcauchy', (0.5,)],
['skewnorm', (4.0,)],
['studentized_range', (3.0, 10.0)],
['t', (2.7433514990818093,)],
['trapezoid', (0.2, 0.8)],
['triang', (0.15785029824528218,)],
['truncexpon', (4.6907725456810478,)],
['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
['truncnorm', (0.1, 2.)],
['truncpareto', (1.8, 5.3)],
['truncpareto', (2, 5)],
['truncweibull_min', (2.5, 0.25, 1.75)],
['tukeylambda', (3.1321477856738267,)],
['uniform', ()],
['vonmises', (3.9939042581071398,)],
['vonmises_line', (3.9939042581071398,)],
['wald', ()],
['weibull_max', (2.8687961709100187,)],
['weibull_min', (1.7866166930421596,)],
['wrapcauchy', (0.031071279018614728,)]]
distdiscrete = [
['bernoulli',(0.3,)],
['betabinom', (5, 2.3, 0.63)],
['betanbinom', (5, 9.3, 1)],
['binom', (5, 0.4)],
['boltzmann',(1.4, 19)],
['dlaplace', (0.8,)], # 0.5
['geom', (0.5,)],
['hypergeom',(30, 12, 6)],
['hypergeom',(21,3,12)], # numpy.random (3,18,12) numpy ticket:921
['hypergeom',(21,18,11)], # numpy.random (18,3,11) numpy ticket:921
['nchypergeom_fisher', (140, 80, 60, 0.5)],
['nchypergeom_wallenius', (140, 80, 60, 0.5)],
['logser', (0.6,)], # re-enabled, numpy ticket:921
['nbinom', (0.4, 0.4)], # from tickets: 583
['nbinom', (5, 0.5)],
['planck', (0.51,)], # 4.1
['poisson', (0.6,)],
['randint', (7, 31)],
['skellam', (15, 8)],
['zipf', (6.6,)],
['zipfian', (0.75, 15)],
['zipfian', (1.25, 10)],
['yulesimon', (11.0,)],
['nhypergeom', (20, 7, 1)]
]
invdistdiscrete = [
# In each of the following, at least one shape parameter is invalid
['hypergeom', (3, 3, 4)],
['nhypergeom', (5, 2, 8)],
['nchypergeom_fisher', (3, 3, 4, 1)],
['nchypergeom_wallenius', (3, 3, 4, 1)],
['bernoulli', (1.5, )],
['binom', (10, 1.5)],
['betabinom', (10, -0.4, -0.5)],
['betanbinom', (10, -0.4, -0.5)],
['boltzmann', (-1, 4)],
['dlaplace', (-0.5, )],
['geom', (1.5, )],
['logser', (1.5, )],
['nbinom', (10, 1.5)],
['planck', (-0.5, )],
['poisson', (-0.5, )],
['randint', (5, 2)],
['skellam', (-5, -2)],
['zipf', (-2, )],
['yulesimon', (-2, )],
['zipfian', (-0.75, 15)]
]
invdistcont = [
# In each of the following, at least one shape parameter is invalid
['alpha', (-1, )],
['anglit', ()],
['arcsine', ()],
['argus', (-1, )],
['beta', (-2, 2)],
['betaprime', (-2, 2)],
['bradford', (-1, )],
['burr', (-1, 1)],
['burr12', (-1, 1)],
['cauchy', ()],
['chi', (-1, )],
['chi2', (-1, )],
['cosine', ()],
['crystalball', (-1, 2)],
['dgamma', (-1, )],
['dweibull', (-1, )],
['erlang', (-1, )],
['expon', ()],
['exponnorm', (-1, )],
['exponweib', (1, -1)],
['exponpow', (-1, )],
['f', (10, -10)],
['fatiguelife', (-1, )],
['fisk', (-1, )],
['foldcauchy', (-1, )],
['foldnorm', (-1, )],
['genlogistic', (-1, )],
['gennorm', (-1, )],
['genpareto', (np.inf, )],
['genexpon', (1, 2, -3)],
['genextreme', (np.inf, )],
['genhyperbolic', (0.5, -0.5, -1.5,)],
['gausshyper', (1, 2, 3, -4)],
['gamma', (-1, )],
['gengamma', (-1, 0)],
['genhalflogistic', (-1, )],
['geninvgauss', (1, 0)],
['gibrat', ()],
['gompertz', (-1, )],
['gumbel_r', ()],
['gumbel_l', ()],
['halfcauchy', ()],
['halflogistic', ()],
['halfnorm', ()],
['halfgennorm', (-1, )],
['hypsecant', ()],
['invgamma', (-1, )],
['invgauss', (-1, )],
['invweibull', (-1, )],
['irwinhall', (-1,)],
['irwinhall', (0,)],
['irwinhall', (2.5,)],
['jf_skew_t', (-1, 0)],
['johnsonsb', (1, -2)],
['johnsonsu', (1, -2)],
['kappa4', (np.nan, 0)],
['kappa3', (-1, )],
['ksone', (-1, )],
['kstwo', (-1, )],
['kstwobign', ()],
['laplace', ()],
['laplace_asymmetric', (-1, )],
['levy', ()],
['levy_l', ()],
['levy_stable', (-1, 1)],
['logistic', ()],
['loggamma', (-1, )],
['loglaplace', (-1, )],
['lognorm', (-1, )],
['loguniform', (10, 5)],
['lomax', (-1, )],
['maxwell', ()],
['mielke', (1, -2)],
['moyal', ()],
['nakagami', (-1, )],
['ncx2', (-1, 2)],
['ncf', (10, 20, -1)],
['nct', (-1, 2)],
['norm', ()],
['norminvgauss', (5, -10)],
['pareto', (-1, )],
['pearson3', (np.nan, )],
['powerlaw', (-1, )],
['powerlognorm', (1, -2)],
['powernorm', (-1, )],
['rdist', (-1, )],
['rayleigh', ()],
['rice', (-1, )],
['recipinvgauss', (-1, )],
['semicircular', ()],
['skewnorm', (np.inf, )],
['studentized_range', (-1, 1)],
['rel_breitwigner', (-2, )],
['t', (-1, )],
['trapezoid', (0, 2)],
['triang', (2, )],
['truncexpon', (-1, )],
['truncnorm', (10, 5)],
['truncpareto', (-1, 5)],
['truncpareto', (1.8, .5)],
['truncweibull_min', (-2.5, 0.25, 1.75)],
['tukeylambda', (np.nan, )],
['uniform', ()],
['vonmises', (-1, )],
['vonmises_line', (-1, )],
['wald', ()],
['weibull_min', (-1, )],
['weibull_max', (-1, )],
['wrapcauchy', (2, )],
['reciprocal', (15, 10)],
['skewcauchy', (2, )]
]

View File

@ -0,0 +1,426 @@
"""
Created on Fri Apr 2 09:06:05 2021
@author: matth
"""
from __future__ import annotations
import math
import numpy as np
from scipy import special
from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
from scipy._lib._array_api import array_namespace
__all__ = ['entropy', 'differential_entropy']
@_axis_nan_policy_factory(
lambda x: x,
n_samples=lambda kwgs: (
2 if ("qk" in kwgs and kwgs["qk"] is not None)
else 1
),
n_outputs=1, result_to_tuple=lambda x: (x,), paired=True,
too_small=-1 # entropy doesn't have too small inputs
)
def entropy(pk: np.typing.ArrayLike,
qk: np.typing.ArrayLike | None = None,
base: float | None = None,
axis: int = 0
) -> np.number | np.ndarray:
"""
Calculate the Shannon entropy/relative entropy of given distribution(s).
If only probabilities `pk` are given, the Shannon entropy is calculated as
``H = -sum(pk * log(pk))``.
If `qk` is not None, then compute the relative entropy
``D = sum(pk * log(pk / qk))``. This quantity is also known
as the Kullback-Leibler divergence.
This routine will normalize `pk` and `qk` if they don't sum to 1.
Parameters
----------
pk : array_like
Defines the (discrete) distribution. Along each axis-slice of ``pk``,
element ``i`` is the (possibly unnormalized) probability of event
``i``.
qk : array_like, optional
Sequence against which the relative entropy is computed. Should be in
the same format as `pk`.
base : float, optional
The logarithmic base to use, defaults to ``e`` (natural logarithm).
axis : int, optional
The axis along which the entropy is calculated. Default is 0.
Returns
-------
S : {float, array_like}
The calculated entropy.
Notes
-----
Informally, the Shannon entropy quantifies the expected uncertainty
inherent in the possible outcomes of a discrete random variable.
For example,
if messages consisting of sequences of symbols from a set are to be
encoded and transmitted over a noiseless channel, then the Shannon entropy
``H(pk)`` gives a tight lower bound for the average number of units of
information needed per symbol if the symbols occur with frequencies
governed by the discrete distribution `pk` [1]_. The choice of base
determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
number of units of information needed per symbol if the encoding is
optimized for the probability distribution `qk` instead of the true
distribution `pk`. Informally, the relative entropy quantifies the expected
excess in surprise experienced if one believes the true distribution is
`qk` when it is actually `pk`.
A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
the formula ``CE = -sum(pk * log(qk))``. It gives the average
number of units of information needed per symbol if an encoding is
optimized for the probability distribution `qk` when the true distribution
is `pk`. It is not computed directly by `entropy`, but it can be computed
using two calls to the function (see Examples).
See [2]_ for more information.
References
----------
.. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
Bell System Technical Journal, 27: 379-423.
https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
.. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
Theory (Wiley Series in Telecommunications and Signal Processing).
Wiley-Interscience, USA.
Examples
--------
The outcome of a fair coin is the most uncertain:
>>> import numpy as np
>>> from scipy.stats import entropy
>>> base = 2 # work in units of bits
>>> pk = np.array([1/2, 1/2]) # fair coin
>>> H = entropy(pk, base=base)
>>> H
1.0
>>> H == -np.sum(pk * np.log(pk)) / np.log(base)
True
The outcome of a biased coin is less uncertain:
>>> qk = np.array([9/10, 1/10]) # biased coin
>>> entropy(qk, base=base)
0.46899559358928117
The relative entropy between the fair coin and biased coin is calculated
as:
>>> D = entropy(pk, qk, base=base)
>>> D
0.7369655941662062
>>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
True
The cross entropy can be calculated as the sum of the entropy and
relative entropy:
>>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
>>> CE
1.736965594166206
>>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
True
"""
if base is not None and base <= 0:
raise ValueError("`base` must be a positive number or `None`.")
xp = array_namespace(pk) if qk is None else array_namespace(pk, qk)
pk = xp.asarray(pk)
with np.errstate(invalid='ignore'):
pk = 1.0*pk / xp.sum(pk, axis=axis, keepdims=True) # type: ignore[operator]
if qk is None:
vec = special.entr(pk)
else:
qk = xp.asarray(qk)
pk, qk = _broadcast_arrays((pk, qk), axis=None, xp=xp) # don't ignore any axes
sum_kwargs = dict(axis=axis, keepdims=True)
qk = 1.0*qk / xp.sum(qk, **sum_kwargs) # type: ignore[operator, call-overload]
vec = special.rel_entr(pk, qk)
S = xp.sum(vec, axis=axis)
if base is not None:
S /= math.log(base)
return S
def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
values = samples[0]
n = values.shape[axis]
window_length = kwargs.get("window_length",
math.floor(math.sqrt(n) + 0.5))
if not 2 <= 2 * window_length < n:
return True
return False
@_axis_nan_policy_factory(
lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,),
too_small=_differential_entropy_is_too_small
)
def differential_entropy(
values: np.typing.ArrayLike,
*,
window_length: int | None = None,
base: float | None = None,
axis: int = 0,
method: str = "auto",
) -> np.number | np.ndarray:
r"""Given a sample of a distribution, estimate the differential entropy.
Several estimation methods are available using the `method` parameter. By
default, a method is selected based on the size of the sample.
Parameters
----------
values : sequence
Sample from a continuous distribution.
window_length : int, optional
Window length for computing Vasicek estimate. Must be an integer
between 1 and half of the sample size. If ``None`` (the default), it
uses the heuristic value
.. math::
\left \lfloor \sqrt{n} + 0.5 \right \rfloor
where :math:`n` is the sample size. This heuristic was originally
proposed in [2]_ and has become common in the literature.
base : float, optional
The logarithmic base to use, defaults to ``e`` (natural logarithm).
axis : int, optional
The axis along which the differential entropy is calculated.
Default is 0.
method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
The method used to estimate the differential entropy from the sample.
Default is ``'auto'``. See Notes for more information.
Returns
-------
entropy : float
The calculated differential entropy.
Notes
-----
This function will converge to the true differential entropy in the limit
.. math::
n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
The optimal choice of ``window_length`` for a given sample size depends on
the (unknown) distribution. Typically, the smoother the density of the
distribution, the larger the optimal value of ``window_length`` [1]_.
The following options are available for the `method` parameter.
* ``'vasicek'`` uses the estimator presented in [1]_. This is
one of the first and most influential estimators of differential entropy.
* ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
is not only consistent but, under some conditions, asymptotically normal.
* ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
in simulation to have smaller bias and mean squared error than
the Vasicek estimator.
* ``'correa'`` uses the estimator presented in [5]_ based on local linear
regression. In a simulation study, it had consistently smaller mean
square error than the Vasicek estimator, but it is more expensive to
compute.
* ``'auto'`` selects the method automatically (default). Currently,
this selects ``'van es'`` for small samples (``n <= 10``), ``'ebrahimi'``
for moderate sample sizes (``10 < n <= 1000``), and ``'vasicek'`` for larger
samples, but this behavior is subject to change in future versions.
All estimators are implemented as described in [6]_.
References
----------
.. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
Journal of the Royal Statistical Society:
Series B (Methodological), 38(1), 54-59.
.. [2] Grzegorzewski, P., & Wieczorkowski, R. (1999). Entropy-based
goodness-of-fit test for exponentiality. Communications in
Statistics-Theory and Methods, 28(5), 1183-1202.
.. [3] Van Es, B. (1992). Estimating functionals related to a density by a
class of statistics based on spacings. Scandinavian Journal of
Statistics, 61-72.
.. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
.. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
in Statistics-Theory and Methods, 24(10), 2439-2449.
.. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
Annals of Data Science, 2(2), 231-241.
https://link.springer.com/article/10.1007/s40745-015-0045-9
Examples
--------
>>> import numpy as np
>>> from scipy.stats import differential_entropy, norm
Entropy of a standard normal distribution:
>>> rng = np.random.default_rng()
>>> values = rng.standard_normal(100)
>>> differential_entropy(values)
1.3407817436640392
Compare with the true entropy:
>>> float(norm.entropy())
1.4189385332046727
For several sample sizes between 5 and 1000, compare the accuracy of
the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
compare the root mean squared error (over 1000 trials) between the estimate
and the true differential entropy of the distribution.
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>>
>>>
>>> def rmse(res, expected):
... '''Root mean squared error'''
... return np.sqrt(np.mean((res - expected)**2))
>>>
>>>
>>> a, b = np.log10(5), np.log10(1000)
>>> ns = np.round(np.logspace(a, b, 10)).astype(int)
>>> reps = 1000 # number of repetitions for each sample size
>>> expected = stats.expon.entropy()
>>>
>>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
>>> for method in method_errors:
... for n in ns:
... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
... res = stats.differential_entropy(rvs, method=method, axis=-1)
... error = rmse(res, expected)
... method_errors[method].append(error)
>>>
>>> for method, errors in method_errors.items():
... plt.loglog(ns, errors, label=method)
>>>
>>> plt.legend()
>>> plt.xlabel('sample size')
>>> plt.ylabel('RMSE (1000 trials)')
>>> plt.title('Entropy Estimator Error (Exponential Distribution)')
"""
values = np.asarray(values)
values = np.moveaxis(values, axis, -1)
n = values.shape[-1] # number of observations
if window_length is None:
window_length = math.floor(math.sqrt(n) + 0.5)
if not 2 <= 2 * window_length < n:
raise ValueError(
f"Window length ({window_length}) must be positive and less "
f"than half the sample size ({n}).",
)
if base is not None and base <= 0:
raise ValueError("`base` must be a positive number or `None`.")
sorted_data = np.sort(values, axis=-1)
methods = {"vasicek": _vasicek_entropy,
"van es": _van_es_entropy,
"correa": _correa_entropy,
"ebrahimi": _ebrahimi_entropy,
"auto": _vasicek_entropy}
method = method.lower()
if method not in methods:
message = f"`method` must be one of {set(methods)}"
raise ValueError(message)
if method == "auto":
if n <= 10:
method = 'van es'
elif n <= 1000:
method = 'ebrahimi'
else:
method = 'vasicek'
res = methods[method](sorted_data, window_length)
if base is not None:
res /= np.log(base)
return res
def _pad_along_last_axis(X, m):
"""Pad the data for computing the rolling window difference."""
# scales a bit better than method in _vasicek_like_entropy
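# Shape note: input of shape (..., n) becomes (..., n + 2*m), with the first
# and last samples along the last axis each repeated m times.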
shape = np.array(X.shape)
shape[-1] = m
Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
Xr = np.broadcast_to(X[..., [-1]], shape)
return np.concatenate((Xl, X, Xr), axis=-1)
def _vasicek_entropy(X, m):
"""Compute the Vasicek estimator as described in [6] Eq. 1.3."""
n = X.shape[-1]
X = _pad_along_last_axis(X, m)
differences = X[..., 2 * m:] - X[..., : -2 * m:]
logs = np.log(n/(2*m) * differences)
return np.mean(logs, axis=-1)
def _van_es_entropy(X, m):
"""Compute the van Es estimator as described in [6]."""
# No equation number, but referred to as HVE_mn.
# Typo: there should be a log within the summation.
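# With that log included, the quantity implemented below is
#     HVE = 1/(n-m) * sum_{i=1..n-m} log( (n+1)/m * (X_(i+m) - X_(i)) )
#           + sum_{k=m..n} 1/k + log(m/(n+1))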
n = X.shape[-1]
difference = X[..., m:] - X[..., :-m]
term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
k = np.arange(m, n+1)
return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
def _ebrahimi_entropy(X, m):
"""Compute the Ebrahimi estimator as described in [6]."""
# No equation number, but referred to as HE_mn
n = X.shape[-1]
X = _pad_along_last_axis(X, m)
differences = X[..., 2 * m:] - X[..., : -2 * m:]
i = np.arange(1, n+1).astype(float)
ci = np.ones_like(i)*2
ci[i <= m] = 1 + (i[i <= m] - 1)/m
ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m
logs = np.log(n * differences / (ci * m))
return np.mean(logs, axis=-1)
def _correa_entropy(X, m):
"""Compute the Correa estimator as described in [6]."""
# No equation number, but referred to as HC_mn
n = X.shape[-1]
X = _pad_along_last_axis(X, m)
i = np.arange(1, n+1)
dj = np.arange(-m, m+1)[:, None]
j = i + dj
j0 = j + m - 1 # 0-indexed version of j
Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
difference = X[..., j0] - Xibar
num = np.sum(difference*dj, axis=-2) # dj is d-i
den = n*np.sum(difference**2, axis=-2)
return -np.mean(np.log(num/den), axis=-1)

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,725 @@
#-------------------------------------------------------------------------------
#
# Define classes for (uni/multi)-variate kernel density estimation.
#
# Currently, only Gaussian kernels are implemented.
#
# Written by: Robert Kern
#
# Date: 2004-08-09
#
# Modified: 2005-02-10 by Robert Kern.
# Contributed to SciPy
# 2005-10-07 by Robert Kern.
# Some fixes to match the new scipy_core
#
# Copyright 2004-2005 by Enthought, Inc.
#
#-------------------------------------------------------------------------------
# Standard library imports.
import warnings
# SciPy imports.
from scipy import linalg, special
from scipy._lib._util import check_random_state
from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
ones, cov)
import numpy as np
# Local imports.
from . import _mvn
from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
__all__ = ['gaussian_kde']
class gaussian_kde:
"""Representation of a kernel-density estimate using Gaussian kernels.
Kernel density estimation is a way to estimate the probability density
function (PDF) of a random variable in a non-parametric way.
`gaussian_kde` works for both uni-variate and multi-variate data. It
includes automatic bandwidth determination. The estimation works best for
a unimodal distribution; bimodal or multi-modal distributions tend to be
oversmoothed.
Parameters
----------
dataset : array_like
Datapoints to estimate from. In case of univariate data this is a 1-D
array, otherwise a 2-D array with shape (# of dims, # of data).
bw_method : str, scalar or callable, optional
The method used to calculate the estimator bandwidth. This can be
'scott', 'silverman', a scalar constant or a callable. If a scalar,
this will be used directly as `kde.factor`. If a callable, it should
take a `gaussian_kde` instance as only parameter and return a scalar.
If None (default), 'scott' is used. See Notes for more details.
weights : array_like, optional
Weights of datapoints. This must be the same shape as `dataset`.
If None (default), the samples are assumed to be equally weighted.
Attributes
----------
dataset : ndarray
The dataset with which `gaussian_kde` was initialized.
d : int
Number of dimensions.
n : int
Number of datapoints.
neff : int
Effective number of datapoints.
.. versionadded:: 1.2.0
factor : float
The bandwidth factor, obtained from `kde.covariance_factor`. The square
of `kde.factor` multiplies the covariance matrix of the data in the kde
estimation.
covariance : ndarray
The covariance matrix of `dataset`, scaled by the calculated bandwidth
(`kde.factor`).
inv_cov : ndarray
The inverse of `covariance`.
Methods
-------
evaluate
__call__
integrate_gaussian
integrate_box_1d
integrate_box
integrate_kde
pdf
logpdf
resample
set_bandwidth
covariance_factor
Notes
-----
Bandwidth selection strongly influences the estimate obtained from the KDE
(much more so than the actual shape of the kernel). Bandwidth selection
can be done by a "rule of thumb", by cross-validation, by "plug-in
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
uses a rule of thumb; the default is Scott's Rule.
Scott's Rule [1]_, implemented as `scotts_factor`, is::
n**(-1./(d+4)),
with ``n`` the number of data points and ``d`` the number of dimensions.
In the case of unequally weighted points, `scotts_factor` becomes::
neff**(-1./(d+4)),
with ``neff`` the effective number of datapoints.
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
(n * (d + 2) / 4.)**(-1. / (d + 4)).
or in the case of unequally weighted points::
(neff * (d + 2) / 4.)**(-1. / (d + 4)).
Good general descriptions of kernel density estimation can be found in [1]_
and [2]_, the mathematics for this multi-dimensional implementation can be
found in [1]_.
With a set of weighted samples, the effective number of datapoints ``neff``
is defined by::
neff = sum(weights)^2 / sum(weights^2)
as detailed in [5]_.
`gaussian_kde` does not currently support data that lies in a
lower-dimensional subspace of the space in which it is expressed. For such
data, consider performing principal component analysis / dimensionality
reduction and using `gaussian_kde` with the transformed data.
References
----------
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
Visualization", John Wiley & Sons, New York, Chicester, 1992.
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
Chapman and Hall, London, 1986.
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
conditional density estimation", Computational Statistics & Data
Analysis, Vol. 36, pp. 279-298, 2001.
.. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
Series A (General), 132, 272
Examples
--------
Generate some random two-dimensional data:
>>> import numpy as np
>>> from scipy import stats
>>> def measure(n):
... "Measurement model, return two coupled measurements."
... m1 = np.random.normal(size=n)
... m2 = np.random.normal(scale=0.5, size=n)
... return m1+m2, m1-m2
>>> m1, m2 = measure(2000)
>>> xmin = m1.min()
>>> xmax = m1.max()
>>> ymin = m2.min()
>>> ymax = m2.max()
Perform a kernel density estimate on the data:
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
>>> positions = np.vstack([X.ravel(), Y.ravel()])
>>> values = np.vstack([m1, m2])
>>> kernel = stats.gaussian_kde(values)
>>> Z = np.reshape(kernel(positions).T, X.shape)
Plot the results:
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots()
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
... extent=[xmin, xmax, ymin, ymax])
>>> ax.plot(m1, m2, 'k.', markersize=2)
>>> ax.set_xlim([xmin, xmax])
>>> ax.set_ylim([ymin, ymax])
>>> plt.show()
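As a rough, illustrative check of the default bandwidth, the factor should
match Scott's rule ``neff**(-1./(d+4))`` described in the Notes:
>>> bool(np.isclose(kernel.factor, kernel.neff ** (-1./(kernel.d + 4))))
True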
"""
def __init__(self, dataset, bw_method=None, weights=None):
self.dataset = atleast_2d(asarray(dataset))
if not self.dataset.size > 1:
raise ValueError("`dataset` input should have multiple elements.")
self.d, self.n = self.dataset.shape
if weights is not None:
self._weights = atleast_1d(weights).astype(float)
self._weights /= sum(self._weights)
if self.weights.ndim != 1:
raise ValueError("`weights` input should be one-dimensional.")
if len(self._weights) != self.n:
raise ValueError("`weights` input should be of length n")
self._neff = 1/sum(self._weights**2)
# This can be converted to a warning once gh-10205 is resolved
if self.d > self.n:
msg = ("Number of dimensions is greater than number of samples. "
"This results in a singular data covariance matrix, which "
"cannot be treated using the algorithms implemented in "
"`gaussian_kde`. Note that `gaussian_kde` interprets each "
"*column* of `dataset` to be a point; consider transposing "
"the input to `dataset`.")
raise ValueError(msg)
try:
self.set_bandwidth(bw_method=bw_method)
except linalg.LinAlgError as e:
msg = ("The data appears to lie in a lower-dimensional subspace "
"of the space in which it is expressed. This has resulted "
"in a singular data covariance matrix, which cannot be "
"treated using the algorithms implemented in "
"`gaussian_kde`. Consider performing principle component "
"analysis / dimensionality reduction and using "
"`gaussian_kde` with the transformed data.")
raise linalg.LinAlgError(msg) from e
def evaluate(self, points):
"""Evaluate the estimated pdf on a set of points.
Parameters
----------
points : (# of dimensions, # of points)-array
Alternatively, a (# of dimensions,) vector can be passed in and
treated as a single point.
Returns
-------
values : (# of points,)-array
The values at each point.
Raises
------
ValueError : if the dimensionality of the input points is different from
the dimensionality of the KDE.
"""
points = atleast_2d(asarray(points))
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = reshape(points, (self.d, 1))
m = 1
else:
msg = (f"points have dimension {d}, "
f"dataset has dimension {self.d}")
raise ValueError(msg)
output_dtype, spec = _get_output_dtype(self.covariance, points)
result = gaussian_kernel_estimate[spec](
self.dataset.T, self.weights[:, None],
points.T, self.cho_cov, output_dtype)
return result[:, 0]
__call__ = evaluate
def integrate_gaussian(self, mean, cov):
"""
Multiply estimated density by a multivariate Gaussian and integrate
over the whole space.
Parameters
----------
mean : array_like
A 1-D array, specifying the mean of the Gaussian.
cov : array_like
A 2-D array, specifying the covariance matrix of the Gaussian.
Returns
-------
result : scalar
The value of the integral.
Raises
------
ValueError
If the mean or covariance of the input Gaussian differs from
the KDE's dimensionality.
"""
mean = atleast_1d(squeeze(mean))
cov = atleast_2d(cov)
if mean.shape != (self.d,):
raise ValueError("mean does not have dimension %s" % self.d)
if cov.shape != (self.d, self.d):
raise ValueError("covariance does not have dimension %s" % self.d)
# make mean a column vector
mean = mean[:, newaxis]
sum_cov = self.covariance + cov
# This will raise LinAlgError if the new cov matrix is not s.p.d
# cho_factor returns (ndarray, bool) where bool is a flag for whether
# or not ndarray is upper or lower triangular
sum_cov_chol = linalg.cho_factor(sum_cov)
diff = self.dataset - mean
tdiff = linalg.cho_solve(sum_cov_chol, diff)
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
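# The diagonal product of the Cholesky factor equals sqrt(det(sum_cov)),
# so `norm_const` is the usual Gaussian normalization
# (2*pi)**(d/2) * sqrt(det(sum_cov)).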
energies = sum(diff * tdiff, axis=0) / 2.0
result = sum(exp(-energies)*self.weights, axis=0) / norm_const
return result
def integrate_box_1d(self, low, high):
"""
Computes the integral of a 1D pdf between two bounds.
Parameters
----------
low : scalar
Lower bound of integration.
high : scalar
Upper bound of integration.
Returns
-------
value : scalar
The result of the integral.
Raises
------
ValueError
If the KDE is over more than one dimension.
"""
if self.d != 1:
raise ValueError("integrate_box_1d() only handles 1D pdfs")
stdev = ravel(sqrt(self.covariance))[0]
normalized_low = ravel((low - self.dataset) / stdev)
normalized_high = ravel((high - self.dataset) / stdev)
value = np.sum(self.weights*(
special.ndtr(normalized_high) -
special.ndtr(normalized_low)))
return value
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
"""Computes the integral of a pdf over a rectangular interval.
Parameters
----------
low_bounds : array_like
A 1-D array containing the lower bounds of integration.
high_bounds : array_like
A 1-D array containing the upper bounds of integration.
maxpts : int, optional
The maximum number of points to use for integration.
Returns
-------
value : scalar
The result of the integral.
"""
if maxpts is not None:
extra_kwds = {'maxpts': maxpts}
else:
extra_kwds = {}
value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
self.dataset, self.weights,
self.covariance, **extra_kwds)
if inform:
msg = ('An integral in _mvn.mvnun requires more points than %s' %
(self.d * 1000))
warnings.warn(msg, stacklevel=2)
return value
def integrate_kde(self, other):
"""
Computes the integral of the product of this kernel density estimate
with another.
Parameters
----------
other : gaussian_kde instance
The other kde.
Returns
-------
value : scalar
The result of the integral.
Raises
------
ValueError
If the KDEs have different dimensionality.
"""
if other.d != self.d:
raise ValueError("KDEs are not the same dimensionality")
# we want to iterate over the smallest number of points
if other.n < self.n:
small = other
large = self
else:
small = self
large = other
sum_cov = small.covariance + large.covariance
sum_cov_chol = linalg.cho_factor(sum_cov)
result = 0.0
for i in range(small.n):
mean = small.dataset[:, i, newaxis]
diff = large.dataset - mean
tdiff = linalg.cho_solve(sum_cov_chol, diff)
energies = sum(diff * tdiff, axis=0) / 2.0
result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
result /= norm_const
return result
def resample(self, size=None, seed=None):
"""Randomly sample a dataset from the estimated pdf.
Parameters
----------
size : int, optional
The number of samples to draw. If not provided, then the size is
the same as the effective number of samples in the underlying
dataset.
seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
singleton is used.
If `seed` is an int, a new ``RandomState`` instance is used,
seeded with `seed`.
If `seed` is already a ``Generator`` or ``RandomState`` instance then
that instance is used.
Returns
-------
resample : (self.d, `size`) ndarray
The sampled dataset.
""" # numpy/numpydoc#87 # noqa: E501
if size is None:
size = int(self.neff)
random_state = check_random_state(seed)
norm = transpose(random_state.multivariate_normal(
zeros((self.d,), float), self.covariance, size=size
))
indices = random_state.choice(self.n, size=size, p=self.weights)
means = self.dataset[:, indices]
return means + norm
def scotts_factor(self):
"""Compute Scott's factor.
Returns
-------
s : float
Scott's factor.
"""
return power(self.neff, -1./(self.d+4))
def silverman_factor(self):
"""Compute the Silverman factor.
Returns
-------
s : float
The Silverman factor.
"""
return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
# Default method to calculate bandwidth, can be overwritten by subclass
covariance_factor = scotts_factor
covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
multiplies the data covariance matrix to obtain the kernel covariance
matrix. The default is `scotts_factor`. A subclass can overwrite this
method to provide a different method, or set it through a call to
`kde.set_bandwidth`."""
def set_bandwidth(self, bw_method=None):
"""Compute the estimator bandwidth with given method.
The new bandwidth calculated after a call to `set_bandwidth` is used
for subsequent evaluations of the estimated density.
Parameters
----------
bw_method : str, scalar or callable, optional
The method used to calculate the estimator bandwidth. This can be
'scott', 'silverman', a scalar constant or a callable. If a
scalar, this will be used directly as `kde.factor`. If a callable,
it should take a `gaussian_kde` instance as only parameter and
return a scalar. If None (default), nothing happens; the current
`kde.covariance_factor` method is kept.
Notes
-----
.. versionadded:: 0.11
Examples
--------
>>> import numpy as np
>>> import scipy.stats as stats
>>> x1 = np.array([-7, -5, 1, 4, 5.])
>>> kde = stats.gaussian_kde(x1)
>>> xs = np.linspace(-10, 10, num=50)
>>> y1 = kde(xs)
>>> kde.set_bandwidth(bw_method='silverman')
>>> y2 = kde(xs)
>>> kde.set_bandwidth(bw_method=kde.factor / 3.)
>>> y3 = kde(xs)
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots()
>>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
... label='Data points (rescaled)')
>>> ax.plot(xs, y1, label='Scott (default)')
>>> ax.plot(xs, y2, label='Silverman')
>>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
>>> ax.legend()
>>> plt.show()
"""
if bw_method is None:
pass
elif bw_method == 'scott':
self.covariance_factor = self.scotts_factor
elif bw_method == 'silverman':
self.covariance_factor = self.silverman_factor
elif np.isscalar(bw_method) and not isinstance(bw_method, str):
self._bw_method = 'use constant'
self.covariance_factor = lambda: bw_method
elif callable(bw_method):
self._bw_method = bw_method
self.covariance_factor = lambda: self._bw_method(self)
else:
msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
"or a callable."
raise ValueError(msg)
self._compute_covariance()
def _compute_covariance(self):
"""Computes the covariance matrix for each Gaussian kernel using
covariance_factor().
"""
self.factor = self.covariance_factor()
# Cache covariance and Cholesky decomp of covariance
if not hasattr(self, '_data_cho_cov'):
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
bias=False,
aweights=self.weights))
self._data_cho_cov = linalg.cholesky(self._data_covariance,
lower=True)
self.covariance = self._data_covariance * self.factor**2
self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
self.log_det = 2*np.log(np.diag(self.cho_cov
* np.sqrt(2*pi))).sum()
@property
def inv_cov(self):
# Re-compute from scratch each time because I'm not sure how this is
# used in the wild. (Perhaps users change the `dataset`, since it's
# not a private attribute?) `_compute_covariance` used to recalculate
# all these, so we'll recalculate everything now that this is a property.
self.factor = self.covariance_factor()
self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
bias=False, aweights=self.weights))
return linalg.inv(self._data_covariance) / self.factor**2
def pdf(self, x):
"""
Evaluate the estimated pdf on a provided set of points.
Notes
-----
This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
docstring for more details.
"""
return self.evaluate(x)
def logpdf(self, x):
"""
Evaluate the log of the estimated pdf on a provided set of points.
"""
points = atleast_2d(x)
d, m = points.shape
if d != self.d:
if d == 1 and m == self.d:
# points was passed in as a row vector
points = reshape(points, (self.d, 1))
m = 1
else:
msg = (f"points have dimension {d}, "
f"dataset has dimension {self.d}")
raise ValueError(msg)
output_dtype, spec = _get_output_dtype(self.covariance, points)
result = gaussian_kernel_estimate_log[spec](
self.dataset.T, self.weights[:, None],
points.T, self.cho_cov, output_dtype)
return result[:, 0]
def marginal(self, dimensions):
"""Return a marginal KDE distribution
Parameters
----------
dimensions : int or 1-d array_like
The dimensions of the multivariate distribution corresponding
with the marginal variables, that is, the indices of the dimensions
that are being retained. The other dimensions are marginalized out.
Returns
-------
marginal_kde : gaussian_kde
An object representing the marginal distribution.
Notes
-----
.. versionadded:: 1.10.0
"""
dims = np.atleast_1d(dimensions)
if not np.issubdtype(dims.dtype, np.integer):
msg = ("Elements of `dimensions` must be integers - the indices "
"of the marginal variables being retained.")
raise ValueError(msg)
n = len(self.dataset) # number of dimensions
original_dims = dims.copy()
dims[dims < 0] = n + dims[dims < 0]
if len(np.unique(dims)) != len(dims):
msg = ("All elements of `dimensions` must be unique.")
raise ValueError(msg)
i_invalid = (dims < 0) | (dims >= n)
if np.any(i_invalid):
msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
f"for a distribution in {n} dimensions.")
raise ValueError(msg)
dataset = self.dataset[dims]
weights = self.weights
return gaussian_kde(dataset, bw_method=self.covariance_factor(),
weights=weights)
@property
def weights(self):
try:
return self._weights
except AttributeError:
self._weights = ones(self.n)/self.n
return self._weights
@property
def neff(self):
try:
return self._neff
except AttributeError:
self._neff = 1/sum(self.weights**2)
return self._neff
def _get_output_dtype(covariance, points):
"""
Calculates the output dtype and the "spec" (=C type name).
This was necessary in order to deal with the fused types in the Cython
routine `gaussian_kernel_estimate`. See gh-10824 for details.
"""
output_dtype = np.common_type(covariance, points)
itemsize = np.dtype(output_dtype).itemsize
if itemsize == 4:
spec = 'float'
elif itemsize == 8:
spec = 'double'
elif itemsize in (12, 16):
spec = 'long double'
else:
raise ValueError(
f"{output_dtype} has unexpected item size: {itemsize}"
)
return output_dtype, spec

View File

@ -0,0 +1,600 @@
# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
# D_n = sup_x{|F_n(x) - F(x)|},
# F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
# F(x) is the CDF of a probability distribution.
#
# Exact methods:
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
# or a recursion algorithm due to Pomeranz[2].
# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
# the Durbin algorithm.
# D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
# Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
# For d > 0.5, the latter intersection probability is 0.
#
# Approximate methods:
# For d close to 0.5, ignoring that intersection term may still give a
# reasonable approximation.
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
# Kolmogorov's initial asymptotic, suitable for large d. (See
# scipy.special.kolmogorov for that asymptotic)
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
# transform the Li-Chien/Korolyuk formula to produce a computational formula
# suitable for small d.
#
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
# the above approaches and it is that which is used here.
#
# Other approaches:
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
# References:
# [1] Durbin J (1968).
# "The Probability that the Sample Distribution Function Lies Between Two
# Parallel Straight Lines."
# Annals of Mathematical Statistics, 39, 398-411.
# [2] Pomeranz J (1974).
# "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
# Small Samples (Algorithm 487)."
# Communications of the ACM, 17(12), 703-704.
# [3] Marsaglia G, Tsang WW, Wang J (2003).
# "Evaluating Kolmogorov's Distribution."
# Journal of Statistical Software, 8(18), 1-4.
# [4] LI-CHIEN, C. (1956).
# "On the exact distribution of the statistics of A. N. Kolmogorov and
# their asymptotic expansion."
# Acta Matematica Sinica, 6, 55-81.
# [5] KOROLYUK, V. S. (1960).
# "Asymptotic analysis of the distribution of the maximum deviation in
# the Bernoulli scheme."
# Theor. Probability Appl., 4, 339-366.
# [6] Pelz W, Good IJ (1976).
# "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
# Statistic."
# Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
# [7] Simard, R., L'Ecuyer, P. (2011)
# "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
# Journal of Statistical Software, Vol 39, 11, 1-18.
# [8] Carvalho, Luis (2015)
# "An Improved Evaluation of Kolmogorov's Distribution"
# Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
# [9] Amit Moscovich, Boaz Nadler (2017)
# "Fast calculation of boundary crossing probabilities for Poisson
# processes",
# Statistics & Probability Letters, Vol 123, 177-182.
import numpy as np
import scipy.special
import scipy.special._ufuncs as scu
from scipy._lib._finite_differences import _derivative
_E128 = 128
_EP128 = np.ldexp(np.longdouble(1), _E128)
_EM128 = np.ldexp(np.longdouble(1), -_E128)
_SQRT2PI = np.sqrt(2 * np.pi)
_LOG_2PI = np.log(2 * np.pi)
_MIN_LOG = -708
_SQRT3 = np.sqrt(3)
_PI_SQUARED = np.pi ** 2
_PI_FOUR = np.pi ** 4
_PI_SIX = np.pi ** 6
# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
# then the Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...,1.
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
-1.9175269175269175269e-3, 8.4175084175084175084e-4,
-5.952380952380952381e-4, 7.9365079365079365079e-4,
-2.7777777777777777778e-3, 8.3333333333333333333e-2]
def _log_nfactorial_div_n_pow_n(n):
# Computes log(n! / n**n)
#    = log((n-1)! / n**(n-1))
# Uses Stirling's approximation, but removes n*log(n) up-front to
# avoid subtractive cancellation.
# = log(n)/2 - n + log(sqrt(2pi)) + sum B_{2j}/(2j)/(2j-1)/n**(2j-1)
rn = 1.0/n
return np.log(n)/2 - n + _LOG_2PI/2 + rn * np.polyval(_STIRLING_COEFFS, rn/n)
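# Illustrative sanity check (a sketch, not executed by the library): for small n
# this should agree with a direct computation via the log-gamma function, e.g.
#   np.isclose(_log_nfactorial_div_n_pow_n(10),
#              scipy.special.gammaln(11) - 10 * np.log(10))   # expected True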
def _clip_prob(p):
"""clips a probability to range 0<=p<=1."""
return np.clip(p, 0.0, 1.0)
def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
"""Selects either the CDF or SF, and then clips to range 0<=p<=1."""
p = np.where(cdf, cdfprob, sfprob)
return _clip_prob(p)
def _kolmogn_DMTW(n, d, cdf=True):
r"""Computes the Kolmogorov CDF: Pr(D_n <= d) using the MTW approach to
the Durbin matrix algorithm.
Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].
"""
# Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
# Generate initial matrix H of size m*m where m=(2k-1)
# Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
# Requires memory O(m^2) and computation O(m^2 log(n)).
# Most suitable for small m.
if d >= 1.0:
return _select_and_clip_prob(1.0, 0.0, cdf)
nd = n * d
if nd <= 0.5:
return _select_and_clip_prob(0.0, 1.0, cdf)
k = int(np.ceil(nd))
h = k - nd
m = 2 * k - 1
H = np.zeros([m, m])
# Initialize: v is first column (and last row) of H
# v[j] = (1 - h^(j+1))/(j+1)!  (except for v[-1])
# w[j] = 1/(j)!
# q = k-th row of H (actually i!/n^i*H^i)
intm = np.arange(1, m + 1)
v = 1.0 - h ** intm
w = np.empty(m)
fac = 1.0
for j in intm:
w[j - 1] = fac
fac /= j # This might underflow. Isn't a problem.
v[j - 1] *= fac
tt = max(2 * h - 1.0, 0)**m - 2*h**m
v[-1] = (1.0 + tt) * fac
for i in range(1, m):
H[i - 1:, i] = w[:m - i + 1]
H[:, 0] = v
H[-1, :] = np.flip(v, axis=0)
Hpwr = np.eye(np.shape(H)[0]) # Holds intermediate powers of H
nn = n
expnt = 0 # Scaling of Hpwr
Hexpnt = 0 # Scaling of H
while nn > 0:
if nn % 2:
Hpwr = np.matmul(Hpwr, H)
expnt += Hexpnt
H = np.matmul(H, H)
Hexpnt *= 2
# Scale as needed.
if np.abs(H[k - 1, k - 1]) > _EP128:
H /= _EP128
Hexpnt += _E128
nn = nn // 2
p = Hpwr[k - 1, k - 1]
# Multiply by n!/n^n
for i in range(1, n + 1):
p = i * p / n
if np.abs(p) < _EM128:
p *= _EP128
expnt -= _E128
# unscale
if expnt != 0:
p = np.ldexp(p, expnt)
return _select_and_clip_prob(p, 1.0-p, cdf)
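# Illustrative cross-check (a sketch): DMTW and Pomeranz are both exact methods,
# so for an (n, x) where both apply they should agree to near machine precision:
#   np.isclose(_kolmogn_DMTW(10, 0.25), _kolmogn_Pomeranz(10, 0.25))  # expected True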
def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
"""Compute the endpoints of the interval for row i."""
if i == 0:
j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
else:
# i + 1 = 2*ip1div2 + ip1mod2
ip1div2, ip1mod2 = divmod(i + 1, 2)
if ip1mod2 == 0: # i is odd
if ip1div2 == n + 1:
j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
else:
j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
else:
j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
return max(j1 + 2, 0), min(j2, n)
def _kolmogn_Pomeranz(n, x, cdf=True):
r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.
Pomeranz (1974) [2]
"""
# V is n*(2n+2) matrix.
# Each row is convolution of the previous row and probabilities from a
# Poisson distribution.
# Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
# Only two rows are needed at any given stage:
# - Call them V0 and V1.
# - Swap each iteration
# Only a few (contiguous) entries in each row can be non-zero.
# - Keep track of start and end (j1 and j2 below)
# - V0s and V1s track the start in the two rows
# Scale intermediate results as needed.
# Only a few different Poisson distributions can occur
t = n * x
ll = int(np.floor(t))
f = 1.0 * (t - ll) # fractional part of t
g = min(f, 1.0 - f)
ceilf = (1 if f > 0 else 0)
roundf = (1 if f > 0.5 else 0)
npwrs = 2 * (ll + 1) # Maximum number of powers needed in convolutions
gpower = np.empty(npwrs) # gpower = (g/n)^m/m!
twogpower = np.empty(npwrs) # twogpower = (2g/n)^m/m!
onem2gpower = np.empty(npwrs) # onem2gpower = ((1-2g)/n)^m/m!
# gpower etc are *almost* Poisson probs, just missing normalizing factor.
gpower[0] = 1.0
twogpower[0] = 1.0
onem2gpower[0] = 1.0
expnt = 0
g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
for m in range(1, npwrs):
gpower[m] = gpower[m - 1] * g_over_n / m
twogpower[m] = twogpower[m - 1] * two_g_over_n / m
onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m
V0 = np.zeros([npwrs])
V1 = np.zeros([npwrs])
V1[0] = 1 # first row
V0s, V1s = 0, 0 # start indices of the two rows
j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
for i in range(1, 2 * n + 2):
# Preserve j1, V1, V1s, V0s from last iteration
k1 = j1
V0, V1 = V1, V0
V0s, V1s = V1s, V0s
V1.fill(0.0)
j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
if i == 1 or i == 2 * n + 1:
pwrs = gpower
else:
pwrs = (twogpower if i % 2 else onem2gpower)
ln2 = j2 - k1 + 1
if ln2 > 0:
conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
conv_start = j1 - k1 # First index to use from conv
conv_len = j2 - j1 + 1 # Number of entries to use from conv
V1[:conv_len] = conv[conv_start:conv_start + conv_len]
# Scale to avoid underflow.
if 0 < np.max(V1) < _EM128:
V1 *= _EP128
expnt -= _E128
V1s = V0s + j1 - k1
# multiply by n!
ans = V1[n - V1s]
for m in range(1, n + 1):
if np.abs(ans) > _EP128:
ans *= _EM128
expnt += _E128
ans *= m
# Undo any intermediate scaling
if expnt != 0:
ans = np.ldexp(ans, expnt)
ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
return ans
def _kolmogn_PelzGood(n, x, cdf=True):
"""Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.
Start with Li-Chien, Korolyuk approximation:
Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
where z = x*sqrt(n).
Transform each K_i(z) using Jacobi theta functions into a form suitable
for small z.
Pelz-Good (1976). [6]
"""
if x <= 0.0:
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
if x >= 1.0:
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
z = np.sqrt(n) * x
zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6
qlog = -_PI_SQUARED / 8 / zsquared
if qlog < _MIN_LOG: # z ~ 0.041743441416853426
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
q = np.exp(qlog)
# Coefficients of terms in the sums for K1, K2 and K3
k1a = -zsquared
k1b = _PI_SQUARED / 4
k2a = 6 * zsix + 2 * zfour
k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
k2c = _PI_FOUR * (1 - 2 * zsquared) / 16
k3d = _PI_SIX * (5 - 30 * zsquared) / 64
k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
k3a = -30 * zsix - 90 * z**8
K0to3 = np.zeros(4)
# Use a Horner scheme to evaluate sum c_i q^(i^2)
# Reduces to a sum over odd integers.
maxk = int(np.ceil(16 * z / np.pi))
for k in range(maxk, 0, -1):
m = 2 * k - 1
msquared, mfour, msix = m**2, m**4, m**6
qpower = np.power(q, 8 * k)
coeffs = np.array([1.0,
k1a + k1b*msquared,
k2a + k2b*msquared + k2c*mfour,
k3a + k3b*msquared + k3c*mfour + k3d*msix])
K0to3 *= qpower
K0to3 += coeffs
K0to3 *= q
K0to3 *= _SQRT2PI
# z**10 > 0 as z > 0.04
K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])
# Now do the other sum over the other terms, all integers k
# K_2: (pi^2 k^2) q^(k^2),
# K_3: (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
# Don't expect much subtractive cancellation so use direct calculation
q = np.exp(-_PI_SQUARED / 2 / zsquared)
ks = np.arange(maxk, 0, -1)
ksquared = ks ** 2
sqrt3z = _SQRT3 * z
kspi = np.pi * ks
qpwers = q ** ksquared
k2extra = np.sum(ksquared * qpwers)
k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
K0to3[2] += k2extra
k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
K0to3[3] += k3extra
powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
K0to3 /= powers_of_n
if not cdf:
K0to3 *= -1
K0to3[0] += 1
Ksum = sum(K0to3)
return Ksum
def _kolmogn(n, x, cdf=True):
"""Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.
x must be of type float, n of type integer.
Simard & L'Ecuyer (2011) [7].
"""
if np.isnan(n):
return n # Keep the same type of nan
if int(n) != n or n <= 0:
return np.nan
if x >= 1.0:
return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
if x <= 0.0:
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
t = n * x
if t <= 1.0: # Ruben-Gambino: 1/2n <= x <= 1/n
if t <= 0.5:
return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
if n <= 140:
prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
else:
prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
if t >= n - 1: # Ruben-Gambino
prob = 2 * (1.0 - x)**n
return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
if x >= 0.5: # Exact: 2 * smirnov
prob = 2 * scipy.special.smirnov(n, x)
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
nxsquared = t * x
if n <= 140:
if nxsquared <= 0.754693:
prob = _kolmogn_DMTW(n, x, cdf=True)
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
if nxsquared <= 4:
prob = _kolmogn_Pomeranz(n, x, cdf=True)
return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
# Now use Miller approximation of 2*smirnov
prob = 2 * scipy.special.smirnov(n, x)
return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)
# Split CDF and SF as they have different cutoffs on nxsquared.
if not cdf:
if nxsquared >= 370.0:
return 0.0
if nxsquared >= 2.2:
prob = 2 * scipy.special.smirnov(n, x)
return _clip_prob(prob)
# Fall through and compute the SF as 1.0-CDF
if nxsquared >= 18.0:
cdfprob = 1.0
elif n <= 100000 and n * x**1.5 <= 1.4:
cdfprob = _kolmogn_DMTW(n, x, cdf=True)
else:
cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
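# Summary of the dispatch above (following Simard & L'Ecuyer [7]), with t = n*x:
#   t <= 1 or t >= n-1     : Ruben-Gambino closed forms
#   x >= 0.5               : exact, 2 * smirnov(n, x)
#   n <= 140               : DMTW for n*x**2 <= 0.754693, Pomeranz for
#                            n*x**2 <= 4, otherwise 2 * smirnov (Miller)
#   n > 140, SF requested  : 0 for n*x**2 >= 370, 2 * smirnov for
#                            n*x**2 >= 2.2, else SF computed as 1 - CDF
#   n > 140, CDF           : 1 for n*x**2 >= 18, DMTW when n <= 100000 and
#                            n*x**1.5 <= 1.4, otherwise Pelz-Good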
def _kolmogn_p(n, x):
"""Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.
x must be of type float, n of type integer.
"""
if np.isnan(n):
return n # Keep the same type of nan
if int(n) != n or n <= 0:
return np.nan
if x >= 1.0 or x <= 0:
return 0
t = n * x
if t <= 1.0:
# Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
if t <= 0.5:
return 0.0
if n <= 140:
prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
else:
prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
return prd * 2 * n**2
if t >= n - 1:
# Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
return 2 * (1.0 - x) ** (n-1) * n
if x >= 0.5:
return 2 * scipy.stats.ksone.pdf(x, n)
# Just take a small delta.
# Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer i,
# as the CDF is a piecewise degree-n polynomial:
# it has knots at 1/n, 2/n, ..., (n-1)/n
# and is not a C-infinity function at the knots.
delta = x / 2.0**16
delta = min(delta, x - 1.0/n)
delta = min(delta, 0.5 - x)
def _kk(_x):
return kolmogn(n, _x)
return _derivative(_kk, x, dx=delta, order=5)
def _kolmogni(n, p, q):
"""Computes the PPF/ISF of kolmogn.
n of type integer, n >= 1
p is the CDF, q the SF, p+q=1
"""
if np.isnan(n):
return n # Keep the same type of nan
if int(n) != n or n <= 0:
return np.nan
if p <= 0:
return 1.0/n
if q <= 0:
return 1.0
delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
if delta <= 1.0/n:
return (delta + 1.0 / n) / 2
x = -np.expm1(np.log(q/2.0)/n)
if x >= 1 - 1.0/n:
return x
x1 = scu._kolmogci(p)/np.sqrt(n)
x1 = min(x1, 1.0 - 1.0/n)
def _f(x):
return _kolmogn(n, x) - p
return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
def kolmogn(n, x, cdf=True):
"""Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
for a sample of size n drawn from a distribution with CDF F(t), where
:math:`D_n = \sup_t |F_n(t) - F(t)|`, and
:math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.
Parameters
----------
n : integer, array_like
the number of samples
x : float, array_like
The K-S statistic, float between 0 and 1
cdf : bool, optional
Whether to compute the CDF (default=True) or the SF.
Returns
-------
cdf : ndarray
CDF (or SF if `cdf` is False) at the specified locations.
The return value has the shape obtained by numpy broadcasting of `n` and `x`.
"""
it = np.nditer([n, x, cdf, None],
op_dtypes=[None, np.float64, np.bool_, np.float64])
for _n, _x, _cdf, z in it:
if np.isnan(_n):
z[...] = _n
continue
if int(_n) != _n:
raise ValueError(f'n is not integral: {_n}')
z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
result = it.operands[-1]
return result
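# Illustrative usage (a sketch; values not asserted):
#   cdf = kolmogn(10, 0.3)              # Pr(D_10 <= 0.3)
#   sf = kolmogn(10, 0.3, cdf=False)    # Pr(D_10 > 0.3)
#   # cdf + sf equals 1.0 up to rounding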
def kolmognp(n, x):
"""Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
Parameters
----------
n : integer, array_like
the number of samples
x : float, array_like
The K-S statistic, float between 0 and 1
Returns
-------
pdf : ndarray
The PDF at the specified locations
The return value has the shape obtained by numpy broadcasting of `n` and `x`.
"""
it = np.nditer([n, x, None])
for _n, _x, z in it:
if np.isnan(_n):
z[...] = _n
continue
if int(_n) != _n:
raise ValueError(f'n is not integral: {_n}')
z[...] = _kolmogn_p(int(_n), _x)
result = it.operands[-1]
return result
def kolmogni(n, q, cdf=True):
"""Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
Parameters
----------
n : integer, array_like
the number of samples
q : float, array_like
Probabilities, float between 0 and 1
cdf : bool, optional
Whether to compute the PPF (default=True) or the ISF.
Returns
-------
ppf : ndarray
PPF (or ISF if cdf is False) at the specified locations
The return value has the shape obtained by numpy broadcasting of `n` and `q`.
"""
it = np.nditer([n, q, cdf, None])
for _n, _q, _cdf, z in it:
if np.isnan(_n):
z[...] = _n
continue
if int(_n) != _n:
raise ValueError(f'n is not integral: {_n}')
_pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
z[...] = _kolmogni(int(_n), _pcdf, _psf)
result = it.operands[-1]
return result

File diff suppressed because it is too large

View File

@ -0,0 +1,494 @@
import numpy as np
from collections import namedtuple
from scipy import special
from scipy import stats
from scipy.stats._stats_py import _rankdata
from ._axis_nan_policy import _axis_nan_policy_factory
def _broadcast_concatenate(x, y, axis):
'''Broadcast then concatenate arrays, leaving concatenation axis last'''
x = np.moveaxis(x, axis, -1)
y = np.moveaxis(y, axis, -1)
z = np.broadcast(x[..., 0], y[..., 0])
x = np.broadcast_to(x, z.shape + (x.shape[-1],))
y = np.broadcast_to(y, z.shape + (y.shape[-1],))
z = np.concatenate((x, y), axis=-1)
return x, y, z
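# Illustrative shapes (a sketch): with axis=-1, x of shape (3, 1, 5) and
# y of shape (1, 4, 7) broadcast to (3, 4, 5) and (3, 4, 7), and the
# concatenated array z has shape (3, 4, 12).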
class _MWU:
'''Distribution of MWU statistic under the null hypothesis'''
def __init__(self, n1, n2):
self._reset(n1, n2)
def set_shapes(self, n1, n2):
n1, n2 = min(n1, n2), max(n1, n2)
if (n1, n2) == (self.n1, self.n2):
return
self.n1 = n1
self.n2 = n2
self.s_array = np.zeros(0, dtype=int)
self.configurations = np.zeros(0, dtype=np.uint64)
def reset(self):
self._reset(self.n1, self.n2)
def _reset(self, n1, n2):
self.n1 = None
self.n2 = None
self.set_shapes(n1, n2)
def pmf(self, k):
# In practice, `pmf` is never called with k > m*n/2.
# If it were, we'd exploit symmetry here:
# k = np.array(k, copy=True)
# k2 = m*n - k
# i = k2 < k
# k[i] = k2[i]
pmfs = self.build_u_freqs_array(np.max(k))
return pmfs[k]
def cdf(self, k):
'''Cumulative distribution function'''
# In practice, `cdf` is never called with k > m*n/2.
# If it were, we'd exploit symmetry here rather than in `sf`
pmfs = self.build_u_freqs_array(np.max(k))
cdfs = np.cumsum(pmfs)
return cdfs[k]
def sf(self, k):
'''Survival function'''
# Note that both CDF and SF include the PMF at k. The p-value is
# calculated from the SF and should include the mass at k, so this
# is desirable
# Use the fact that the distribution is symmetric and sum from the left
kc = np.asarray(self.n1*self.n2 - k) # complement of k
i = k < kc
if np.any(i):
kc[i] = k[i]
cdfs = np.asarray(self.cdf(kc))
cdfs[i] = 1. - cdfs[i] + self.pmf(kc[i])
else:
cdfs = np.asarray(self.cdf(kc))
return cdfs[()]
# build_sigma_array and build_u_freqs_array adapted from code
# by @toobaz with permission. Thanks to @andreasloe for the suggestion.
# See https://github.com/scipy/scipy/pull/4933#issuecomment-1898082691
def build_sigma_array(self, a):
n1, n2 = self.n1, self.n2
if a + 1 <= self.s_array.size:
return self.s_array[1:a+1]
s_array = np.zeros(a + 1, dtype=int)
for d in np.arange(1, n1 + 1):
# All multiples of d, except 0:
indices = np.arange(d, a + 1, d)
# \epsilon_d = 1:
s_array[indices] += d
for d in np.arange(n2 + 1, n2 + n1 + 1):
# All multiples of d, except 0:
indices = np.arange(d, a + 1, d)
# \epsilon_d = -1:
s_array[indices] -= d
# We don't need 0:
self.s_array = s_array
return s_array[1:]
def build_u_freqs_array(self, maxu):
"""
Build all the array of frequencies for u from 0 to maxu.
Assumptions:
n1 <= n2
maxu <= n1 * n2 / 2
"""
n1, n2 = self.n1, self.n2
total = special.binom(n1 + n2, n1)
if maxu + 1 <= self.configurations.size:
return self.configurations[:maxu + 1] / total
s_array = self.build_sigma_array(maxu)
# Start working with ints, for maximum precision and efficiency:
configurations = np.zeros(maxu + 1, dtype=np.uint64)
configurations_is_uint = True
uint_max = np.iinfo(np.uint64).max
# How many ways to have U=0? 1
configurations[0] = 1
for u in np.arange(1, maxu + 1):
coeffs = s_array[u - 1::-1]
new_val = np.dot(configurations[:u], coeffs) / u
if new_val > uint_max and configurations_is_uint:
# OK, we got into numbers too big for uint64.
# So now we start working with floats.
# By doing this since the beginning, we would have lost precision.
# (And working on python long ints would be unbearably slow)
configurations = configurations.astype(float)
configurations_is_uint = False
configurations[u] = new_val
self.configurations = configurations
return configurations / total
_mwu_state = _MWU(0, 0)
def _get_mwu_z(U, n1, n2, t, axis=0, continuity=True):
'''Standardized MWU statistic'''
# Follows mannwhitneyu [2]
mu = n1 * n2 / 2
n = n1 + n2
# Tie correction according to [2], "Normal approximation and tie correction"
# "A more computationally-efficient form..."
tie_term = (t**3 - t).sum(axis=-1)
s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))
numerator = U - mu
# Continuity correction.
# Because SF is always used to calculate the p-value, we can always
# _subtract_ 0.5 for the continuity correction. This always increases the
# p-value to account for the rest of the probability mass _at_ q = U.
if continuity:
numerator -= 0.5
# no problem evaluating the norm SF at an infinity
with np.errstate(divide='ignore', invalid='ignore'):
z = numerator / s
return z
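# In symbols, the statistic computed above is
#   z = (U - mu - c) / sigma,  with c = 0.5 if `continuity` else 0,
#   mu = n1*n2/2,
#   sigma = sqrt( n1*n2/12 * ( (n + 1) - sum(t**3 - t) / (n*(n - 1)) ) ),
# i.e. the tie-corrected normal approximation described in [2].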
def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
''' Input validation and standardization for mannwhitneyu '''
# Would use np.asarray_chkfinite, but infs are OK
x, y = np.atleast_1d(x), np.atleast_1d(y)
if np.isnan(x).any() or np.isnan(y).any():
raise ValueError('`x` and `y` must not contain NaNs.')
if np.size(x) == 0 or np.size(y) == 0:
raise ValueError('`x` and `y` must be of nonzero size.')
bools = {True, False}
if use_continuity not in bools:
raise ValueError(f'`use_continuity` must be one of {bools}.')
alternatives = {"two-sided", "less", "greater"}
alternative = alternative.lower()
if alternative not in alternatives:
raise ValueError(f'`alternative` must be one of {alternatives}.')
axis_int = int(axis)
if axis != axis_int:
raise ValueError('`axis` must be an integer.')
if not isinstance(method, stats.PermutationMethod):
methods = {"asymptotic", "exact", "auto"}
method = method.lower()
if method not in methods:
raise ValueError(f'`method` must be one of {methods}.')
return x, y, use_continuity, alternative, axis_int, method
def _mwu_choose_method(n1, n2, ties):
"""Choose method 'asymptotic' or 'exact' depending on input size, ties"""
# if both inputs are large, asymptotic is OK
if n1 > 8 and n2 > 8:
return "asymptotic"
# if there are any ties, asymptotic is preferred
if ties:
return "asymptotic"
return "exact"
MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
axis=0, method="auto"):
r'''Perform the Mann-Whitney U rank test on two independent samples.
The Mann-Whitney U test is a nonparametric test of the null hypothesis
that the distribution underlying sample `x` is the same as the
distribution underlying sample `y`. It is often used as a test of
difference in location between distributions.
Parameters
----------
x, y : array-like
N-d arrays of samples. The arrays must be broadcastable except along
the dimension given by `axis`.
use_continuity : bool, optional
Whether a continuity correction (1/2) should be applied.
Default is True when `method` is ``'asymptotic'``; has no effect
otherwise.
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis. Default is 'two-sided'.
Let *F(u)* and *G(u)* be the cumulative distribution functions of the
distributions underlying `x` and `y`, respectively. Then the following
alternative hypotheses are available:
* 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
at least one *u*.
* 'less': the distribution underlying `x` is stochastically less
than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
* 'greater': the distribution underlying `x` is stochastically greater
than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.
Note that the mathematical expressions in the alternative hypotheses
above describe the CDFs of the underlying distributions. The directions
of the inequalities appear inconsistent with the natural language
description at first glance, but they are not. For example, suppose
*X* and *Y* are random variables that follow distributions with CDFs
*F* and *G*, respectively. If *F(u) > G(u)* for all *u*, samples drawn
from *X* tend to be less than those drawn from *Y*.
Under a more restrictive set of assumptions, the alternative hypotheses
can be expressed in terms of the locations of the distributions;
see [5] section 5.1.
axis : int, optional
Axis along which to perform the test. Default is 0.
method : {'auto', 'asymptotic', 'exact'} or `PermutationMethod` instance, optional
Selects the method used to calculate the *p*-value.
Default is 'auto'. The following options are available.
* ``'asymptotic'``: compares the standardized test statistic
against the normal distribution, correcting for ties.
* ``'exact'``: computes the exact *p*-value by comparing the observed
:math:`U` statistic against the exact distribution of the :math:`U`
statistic under the null hypothesis. No correction is made for ties.
* ``'auto'``: chooses ``'exact'`` when the size of one of the samples
is less than or equal to 8 and there are no ties;
chooses ``'asymptotic'`` otherwise.
* `PermutationMethod` instance. In this case, the p-value
is computed using `permutation_test` with the provided
configuration options and other appropriate settings.
Returns
-------
res : MannwhitneyuResult
An object containing attributes:
statistic : float
The Mann-Whitney U statistic corresponding with sample `x`. See
Notes for the test statistic corresponding with sample `y`.
pvalue : float
The associated *p*-value for the chosen `alternative`.
Notes
-----
If ``U1`` is the statistic corresponding with sample `x`, then the
statistic corresponding with sample `y` is
``U2 = x.shape[axis] * y.shape[axis] - U1``.
`mannwhitneyu` is for independent samples. For related / paired samples,
consider `scipy.stats.wilcoxon`.
`method` ``'exact'`` is recommended when there are no ties and when either
sample size is less than 8 [1]_. The implementation follows the algorithm
reported in [3]_.
Note that the exact method is *not* corrected for ties, but
`mannwhitneyu` will not raise errors or warnings if there are ties in the
data. If there are ties and either sample is small (fewer than ~10
observations), consider passing an instance of `PermutationMethod`
as the `method` to perform a permutation test.
The Mann-Whitney U test is a non-parametric version of the t-test for
independent samples. When the means of samples from the populations
are normally distributed, consider `scipy.stats.ttest_ind`.
See Also
--------
scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
References
----------
.. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
variables is stochastically larger than the other", The Annals of
Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
.. [2] Mann-Whitney U Test, Wikipedia,
http://en.wikipedia.org/wiki/Mann-Whitney_U_test
.. [3] Andreas Löffler,
"Über eine Partition der nat. Zahlen und ihr Anwendung beim U-Test",
Wiss. Z. Univ. Halle, XXXII'83 pp. 87-89.
.. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
Learning Support Centre, 2004.
.. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
or t-test? On assumptions for hypothesis tests and multiple \
interpretations of decision rules." Statistics surveys, Vol. 4, pp.
1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
Examples
--------
We follow the example from [4]_: nine randomly sampled young adults were
diagnosed with type II diabetes at the ages below.
>>> males = [19, 22, 16, 29, 24]
>>> females = [20, 11, 17, 12]
We use the Mann-Whitney U test to assess whether there is a statistically
significant difference in the diagnosis age of males and females.
The null hypothesis is that the distribution of male diagnosis ages is
the same as the distribution of female diagnosis ages. We decide
that a confidence level of 95% is required to reject the null hypothesis
in favor of the alternative that the distributions are different.
Since the number of samples is very small and there are no ties in the
data, we can compare the observed test statistic against the *exact*
distribution of the test statistic under the null hypothesis.
>>> from scipy.stats import mannwhitneyu
>>> U1, p = mannwhitneyu(males, females, method="exact")
>>> print(U1)
17.0
`mannwhitneyu` always reports the statistic associated with the first
sample, which, in this case, is males. This agrees with :math:`U_M = 17`
reported in [4]_. The statistic associated with the second sample
can be calculated:
>>> nx, ny = len(males), len(females)
>>> U2 = nx*ny - U1
>>> print(U2)
3.0
This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
*p*-value can be calculated from either statistic, and the value produced
by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
>>> print(p)
0.1111111111111111
The exact distribution of the test statistic is asymptotically normal, so
the example continues by comparing the exact *p*-value against the
*p*-value produced using the normal approximation.
>>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
>>> print(pnorm)
0.11134688653314041
Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
value :math:`p = 0.09` given in [4]_. The reason is that [4]_
does not apply the continuity correction performed by `mannwhitneyu`;
`mannwhitneyu` reduces the distance between the test statistic and the
mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
discrete statistic is being compared against a continuous distribution.
Here, the :math:`U` statistic used is less than the mean, so we reduce
the distance by adding 0.5 in the numerator.
>>> import numpy as np
>>> from scipy.stats import norm
>>> U = min(U1, U2)
>>> N = nx + ny
>>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
>>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic
>>> print(p)
0.11134688653314041
If desired, we can disable the continuity correction to get a result
that agrees with that reported in [4]_.
>>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
... method="asymptotic")
>>> print(pnorm)
0.0864107329737
Regardless of whether we perform an exact or asymptotic test, the
probability of the test statistic being as extreme or more extreme by
chance exceeds 5%, so we do not consider the results statistically
significant.
Suppose that, before seeing the data, we had hypothesized that females
would tend to be diagnosed at a younger age than males.
In that case, it would be natural to provide the female ages as the
first input, and we would have performed a one-sided test using
``alternative = 'less'``: females are diagnosed at an age that is
stochastically less than that of males.
>>> res = mannwhitneyu(females, males, alternative="less", method="exact")
>>> print(res)
MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
Again, the probability of getting a sufficiently low value of the
test statistic by chance under the null hypothesis is greater than 5%,
so we do not reject the null hypothesis in favor of our alternative.
If it is reasonable to assume that the means of samples from the
populations are normally distributed, we could have used a t-test to
perform the analysis.
>>> from scipy.stats import ttest_ind
>>> res = ttest_ind(females, males, alternative="less")
>>> print(res)
TtestResult(statistic=-2.239334696520584,
pvalue=0.030068441095757924,
df=7.0)
Under this assumption, the *p*-value would be low enough to reject the
null hypothesis in favor of the alternative.
'''
x, y, use_continuity, alternative, axis_int, method = (
_mwu_input_validation(x, y, use_continuity, alternative, axis, method))
x, y, xy = _broadcast_concatenate(x, y, axis)
n1, n2 = x.shape[-1], y.shape[-1]
# Follows [2]
ranks, t = _rankdata(xy, 'average', return_ties=True) # method 2, step 1
R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2
U1 = R1 - n1*(n1+1)/2 # method 2, step 3
U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2
if alternative == "greater":
U, f = U1, 1 # U is the statistic to use for p-value, f is a factor
elif alternative == "less":
U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1
else:
U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test
if method == "auto":
method = _mwu_choose_method(n1, n2, np.any(t > 1))
if method == "exact":
_mwu_state.set_shapes(n1, n2)
p = _mwu_state.sf(U.astype(int))
elif method == "asymptotic":
z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity)
p = stats.norm.sf(z)
else: # `PermutationMethod` instance (already validated)
def statistic(x, y, axis):
return mannwhitneyu(x, y, use_continuity=use_continuity,
alternative=alternative, axis=axis,
method="asymptotic").statistic
res = stats.permutation_test((x, y), statistic, axis=axis,
**method._asdict(), alternative=alternative)
p = res.pvalue
f = 1
p *= f
# Ensure that the p-value is not greater than 1
# This could happen for exact test when U = m*n/2
p = np.clip(p, 0, 1)
return MannwhitneyuResult(U1, p)

View File

@ -0,0 +1,550 @@
import warnings
import numpy as np
from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan
from scipy._lib._bunch import _make_tuple_bunch
from scipy.spatial.distance import cdist
from scipy.ndimage import _measurements
from ._stats import _local_correlations # type: ignore[import-not-found]
from . import distributions
__all__ = ['multiscale_graphcorr']
# FROM MGCPY: https://github.com/neurodata/mgcpy
class _ParallelP:
"""Helper function to calculate parallel p-value."""
def __init__(self, x, y, random_states):
self.x = x
self.y = y
self.random_states = random_states
def __call__(self, index):
order = self.random_states[index].permutation(self.y.shape[0])
permy = self.y[order][:, order]
# calculate permuted stats, store in null distribution
perm_stat = _mgc_stat(self.x, permy)[0]
return perm_stat
def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
r"""Helper function that calculates the p-value. See below for uses.
Parameters
----------
x, y : ndarray
`x` and `y` have shapes `(n, p)` and `(n, q)`.
stat : float
The sample test statistic.
reps : int, optional
The number of replications used to estimate the null when using the
permutation test. The default is 1000 replications.
workers : int or map-like callable, optional
If `workers` is an int the population is subdivided into `workers`
sections and evaluated in parallel (uses
`multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
available to the Process. Alternatively supply a map-like callable,
such as `multiprocessing.Pool.map` for evaluating the population in
parallel. This evaluation is carried out as `workers(func, iterable)`.
Requires that `func` be pickleable.
random_state : {None, int, `numpy.random.Generator`,
`numpy.random.RandomState`}, optional
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
singleton is used.
If `seed` is an int, a new ``RandomState`` instance is used,
seeded with `seed`.
If `seed` is already a ``Generator`` or ``RandomState`` instance then
that instance is used.
Returns
-------
pvalue : float
The sample test p-value.
null_dist : list
The approximated null distribution.
"""
# generate seeds for each rep (change to new parallel random number
# capabilities in numpy >= 1.17+)
random_state = check_random_state(random_state)
random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
size=4, dtype=np.uint32)) for _ in range(reps)]
# parallelizes with specified workers over number of reps and set seeds
parallelp = _ParallelP(x=x, y=y, random_states=random_states)
with MapWrapper(workers) as mapwrapper:
null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
# calculate p-value and significant permutation map through list
pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
return pvalue, null_dist
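# Note: the "+1" in both numerator and denominator is the usual permutation-test
# correction that counts the observed statistic as one of the permutations, so
# the reported p-value can never be exactly zero.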
def _euclidean_dist(x):
return cdist(x, x)
MGCResult = _make_tuple_bunch('MGCResult',
['statistic', 'pvalue', 'mgc_dict'], [])
def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
workers=1, is_twosamp=False, random_state=None):
r"""Computes the Multiscale Graph Correlation (MGC) test statistic.
Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
called the "scale". A priori, however, it is not know which scales will be
most informative. So, MGC computes all distance pairs, and then efficiently
computes the distance correlations for all scales. The local correlations
illustrate which scales are relatively informative about the relationship.
The key, therefore, to successfully discover and decipher relationships
between disparate data modalities is to adaptively determine which scales
are the most informative, and the geometric implication for the most
informative scales. Doing so not only provides an estimate of whether the
modalities are related, but also provides insight into how the
determination was made. This is especially important in high-dimensional
data, where simple visualizations do not reveal relationships to the
unaided human eye. Characterizations of this implementation in particular
have been derived from and benchmarked within [2]_.
Parameters
----------
x, y : ndarray
If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
the number of samples and `p` and `q` are the number of dimensions,
then the MGC independence test will be run. Alternatively, ``x`` and
``y`` can have shapes ``(n, n)`` if they are distance or similarity
matrices, and ``compute_distance`` must be set to ``None``. If ``x``
and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
two-sample MGC test will be run.
compute_distance : callable, optional
A function that computes the distance or similarity among the samples
within each data matrix. Set to ``None`` if ``x`` and ``y`` are
already distance matrices. The default uses the euclidean norm metric.
If you are calling a custom function, either create the distance
matrix before-hand or create a function of the form
``compute_distance(x)`` where `x` is the data matrix for which
pairwise distances are calculated.
reps : int, optional
The number of replications used to estimate the null when using the
permutation test. The default is ``1000``.
workers : int or map-like callable, optional
If ``workers`` is an int the population is subdivided into ``workers``
sections and evaluated in parallel (uses ``multiprocessing.Pool
<multiprocessing>``). Supply ``-1`` to use all cores available to the
Process. Alternatively supply a map-like callable, such as
``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
This evaluation is carried out as ``workers(func, iterable)``.
Requires that `func` be pickleable. The default is ``1``.
is_twosamp : bool, optional
If `True`, a two sample test will be run. If ``x`` and ``y`` have
shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
``(n, p)`` and a two sample test is desired. The default is ``False``.
Note that this will not run if inputs are distance matrices.
random_state : {None, int, `numpy.random.Generator`,
`numpy.random.RandomState`}, optional
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
singleton is used.
If `seed` is an int, a new ``RandomState`` instance is used,
seeded with `seed`.
If `seed` is already a ``Generator`` or ``RandomState`` instance then
that instance is used.
Returns
-------
res : MGCResult
An object containing attributes:
statistic : float
The sample MGC test statistic within `[-1, 1]`.
pvalue : float
The p-value obtained via permutation.
mgc_dict : dict
Contains additional useful results:
- mgc_map : ndarray
A 2D representation of the latent geometry of the
relationship.
- opt_scale : (int, int)
The estimated optimal scale as a `(x, y)` pair.
- null_dist : list
The null distribution derived from the permuted matrices.
See Also
--------
pearsonr : Pearson correlation coefficient and p-value for testing
non-correlation.
kendalltau : Calculates Kendall's tau.
spearmanr : Calculates a Spearman rank-order correlation coefficient.
Notes
-----
A description of the process of MGC and applications on neuroscience data
can be found in [1]_. It is performed using the following steps:
#. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
modified to be mean zero columnwise. This results in two
:math:`n \times n` distance matrices :math:`A` and :math:`B` (the
centering and unbiased modification) [3]_.
#. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,
* The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
are calculated for each property. Here, :math:`G_k (i, j)` indicates
the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
and :math:`H_l (i, j)` indicates the :math:`l`-smallest values of
the :math:`i`-th row of :math:`B`
* Let :math:`\circ` denote the entry-wise matrix product; then local
correlations are summed and normalized using the following statistic:
.. math::
c^{kl} = \frac{\sum_{ij} A G_k B H_l}
{\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}
#. The MGC test statistic is the smoothed optimal local correlation of
:math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
(which essentially sets all isolated large correlations to 0 and leaves
connected large correlations the same as before; see [3]_). MGC is,
.. math::
MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
\right)
The test statistic returns a value between :math:`(-1, 1)` since it is
normalized.
The p-value returned is calculated using a permutation test. This process
is completed by first randomly permuting :math:`y` to estimate the null
distribution and then calculating the probability of observing a test
statistic, under the null, at least as extreme as the observed test
statistic.
MGC requires at least 5 samples to run with reliable results. It can also
handle high-dimensional data sets.
In addition, by manipulating the input data matrices, the two-sample
testing problem can be reduced to the independence testing problem [4]_.
Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n` and
:math:`p \times m`, data matrices :math:`X` and :math:`Y` can be created as
follows:
.. math::
X = [U | V] \in \mathcal{R}^{p \times (n + m)}
Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}
Then, the MGC statistic can be calculated as normal. This methodology can
be extended to similar tests such as distance correlation [4]_.
.. versionadded:: 1.4.0
References
----------
.. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
Maggioni, M., & Shen, C. (2019). Discovering and deciphering
relationships across disparate data modalities. ELife.
.. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
mgcpy: A Comprehensive High Dimensional Independence Testing Python
Package. :arXiv:`1907.02088`
.. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
correlation to multiscale graph correlation. Journal of the American
Statistical Association.
.. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
Distance and Kernel Methods for Hypothesis Testing.
:arXiv:`1806.05514`
Examples
--------
>>> import numpy as np
>>> from scipy.stats import multiscale_graphcorr
>>> x = np.arange(100)
>>> y = x
>>> res = multiscale_graphcorr(x, y)
>>> res.statistic, res.pvalue
(1.0, 0.001)
To run an unpaired two-sample test,
>>> x = np.arange(100)
>>> y = np.arange(79)
>>> res = multiscale_graphcorr(x, y)
>>> res.statistic, res.pvalue # doctest: +SKIP
(0.033258146255703246, 0.023)
or, if shape of the inputs are the same,
>>> x = np.arange(100)
>>> y = x
>>> res = multiscale_graphcorr(x, y, is_twosamp=True)
>>> res.statistic, res.pvalue # doctest: +SKIP
(-0.008021809890200488, 1.0)
"""
if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
raise ValueError("x and y must be ndarrays")
# convert arrays of type (n,) to (n, 1)
if x.ndim == 1:
x = x[:, np.newaxis]
elif x.ndim != 2:
raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}")
if y.ndim == 1:
y = y[:, np.newaxis]
elif y.ndim != 2:
raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}")
nx, px = x.shape
ny, py = y.shape
# check for NaNs
_contains_nan(x, nan_policy='raise')
_contains_nan(y, nan_policy='raise')
# check for positive or negative infinity and raise error
if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
raise ValueError("Inputs contain infinities")
if nx != ny:
if px == py:
# reshape x and y for two sample testing
is_twosamp = True
else:
raise ValueError("Shape mismatch, x and y must have shape [n, p] "
"and [n, q] or have shape [n, p] and [m, p].")
if nx < 5 or ny < 5:
raise ValueError("MGC requires at least 5 samples to give reasonable "
"results.")
# convert x and y to float
x = x.astype(np.float64)
y = y.astype(np.float64)
# check that compute_distance is a callable
if not callable(compute_distance) and compute_distance is not None:
raise ValueError("Compute_distance must be a function.")
# check that the number of reps is a positive integer (a warning is raised
# if it is under 1000)
if not isinstance(reps, int) or reps < 0:
raise ValueError("Number of reps must be an integer greater than 0.")
elif reps < 1000:
msg = ("The number of replications is low (under 1000), and p-value "
"calculations may be unreliable. Use the p-value result, with "
"caution!")
warnings.warn(msg, RuntimeWarning, stacklevel=2)
if is_twosamp:
if compute_distance is None:
raise ValueError("Cannot run if inputs are distance matrices")
x, y = _two_sample_transform(x, y)
if compute_distance is not None:
# compute distance matrices for x and y
x = compute_distance(x)
y = compute_distance(y)
# calculate MGC stat
stat, stat_dict = _mgc_stat(x, y)
stat_mgc_map = stat_dict["stat_mgc_map"]
opt_scale = stat_dict["opt_scale"]
# calculate permutation MGC p-value
pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
random_state=random_state)
# save all stats (other than stat/p-value) in dictionary
mgc_dict = {"mgc_map": stat_mgc_map,
"opt_scale": opt_scale,
"null_dist": null_dist}
# create result object with alias for backward compatibility
res = MGCResult(stat, pvalue, mgc_dict)
res.stat = stat
return res
def _mgc_stat(distx, disty):
r"""Helper function that calculates the MGC stat. See above for use.
Parameters
----------
distx, disty : ndarray
`distx` and `disty` have shapes `(n, p)` and `(n, q)` or
`(n, n)` and `(n, n)`
if distance matrices.
Returns
-------
stat : float
The sample MGC test statistic within `[-1, 1]`.
stat_dict : dict
Contains additional useful returns with the following keys:
- stat_mgc_map : ndarray
MGC-map of the statistics.
- opt_scale : (float, float)
The estimated optimal scale as a `(x, y)` pair.
"""
# calculate MGC map and optimal scale
stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')
n, m = stat_mgc_map.shape
if m == 1 or n == 1:
# the global scale is the statistic calculated at the maximal nearest
# neighbors. There is not enough local scale to search over, so
# default to global scale
stat = stat_mgc_map[m - 1][n - 1]
opt_scale = m * n
else:
samp_size = len(distx) - 1
# threshold to find connected region of significant local correlations
sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)
# maximum within the significant region
stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)
stat_dict = {"stat_mgc_map": stat_mgc_map,
"opt_scale": opt_scale}
return stat, stat_dict
def _threshold_mgc_map(stat_mgc_map, samp_size):
r"""
Finds a connected region of significance in the MGC-map by thresholding.
Parameters
----------
stat_mgc_map : ndarray
All local correlations within `[-1,1]`.
samp_size : int
The sample size of original data.
Returns
-------
sig_connect : ndarray
A binary matrix with 1's indicating the significant region.
"""
m, n = stat_mgc_map.shape
# 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
# with varying levels of performance. Threshold is based on a beta
# approximation.
per_sig = 1 - (0.02 / samp_size) # Percentile to consider as significant
threshold = samp_size * (samp_size - 3)/4 - 1/2 # Beta approximation
threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1
# the global scale is the statistic calculated at the maximal nearest
# neighbors. Threshold is the maximum on the global and local scales
threshold = max(threshold, stat_mgc_map[m - 1][n - 1])
# find the largest connected component of significant correlations
sig_connect = stat_mgc_map > threshold
if np.sum(sig_connect) > 0:
sig_connect, _ = _measurements.label(sig_connect)
_, label_counts = np.unique(sig_connect, return_counts=True)
# skip the first element in label_counts, as it is count(zeros)
max_label = np.argmax(label_counts[1:]) + 1
sig_connect = sig_connect == max_label
else:
sig_connect = np.array([[False]])
return sig_connect
def _smooth_mgc_map(sig_connect, stat_mgc_map):
"""Finds the smoothed maximal within the significant region R.
If area of R is too small it returns the last local correlation. Otherwise,
returns the maximum within significant_connected_region.
Parameters
----------
sig_connect : ndarray
A binary matrix with 1's indicating the significant region.
stat_mgc_map : ndarray
All local correlations within `[-1, 1]`.
Returns
-------
stat : float
The sample MGC statistic within `[-1, 1]`.
opt_scale: (float, float)
The estimated optimal scale as an `(x, y)` pair.
"""
m, n = stat_mgc_map.shape
# the global scale is the statistic calculated at the maximal nearest
# neighbors. By default, statistic and optimal scale are global.
stat = stat_mgc_map[m - 1][n - 1]
opt_scale = [m, n]
if np.linalg.norm(sig_connect) != 0:
# proceed only when the connected region's area is sufficiently large
# 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
# with varying levels of performance
if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
max_corr = max(stat_mgc_map[sig_connect])
# find all scales within significant_connected_region that maximize
# the local correlation
max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
if max_corr >= stat:
stat = max_corr
k, l = max_corr_index
one_d_indices = k * n + l # 2D to 1D indexing
k = np.max(one_d_indices) // n
l = np.max(one_d_indices) % n
opt_scale = [k+1, l+1] # adding 1s to match R indexing
return stat, opt_scale
def _two_sample_transform(u, v):
"""Helper function that concatenates x and y for two sample MGC stat.
See above for use.
Parameters
----------
u, v : ndarray
`u` and `v` have shapes `(n, p)` and `(m, p)`.
Returns
-------
x : ndarray
Concatenation of `u` and `v` along ``axis=0``. `x` thus has shape
`(n + m, p)`.
y : ndarray
Label matrix for `x` where 0 refers to samples that come from `u` and
1 refers to samples that come from `v`. `y` thus has shape `(n + m, 1)`.
"""
nx = u.shape[0]
ny = v.shape[0]
x = np.concatenate([u, v], axis=0)
y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
return x, y
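# Illustrative shapes (a sketch): u of shape (10, 3) and v of shape (15, 3)
# give x of shape (25, 3) and y of shape (25, 1), with y holding 0 for the
# first 10 rows (from u) and 1 for the remaining 15 rows (from v).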

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,521 @@
"""
Additional statistics functions with support for masked arrays.
"""
# Original author (2007): Pierre GF Gerard-Marchant
__all__ = ['compare_medians_ms',
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
'idealfourths',
'median_cihs','mjci','mquantiles_cimj',
'rsh',
'trimmed_mean_ci',]
import numpy as np
from numpy import float64, ndarray
import numpy.ma as ma
from numpy.ma import MaskedArray
from . import _mstats_basic as mstats
from scipy.stats.distributions import norm, beta, t, binom
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
"""
Computes quantile estimates with the Harrell-Davis method.
The quantile estimates are calculated as a weighted linear combination
of order statistics.
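Specifically, for a probability level ``p`` and sorted data ``x_(1) <= ... <= x_(n)``,
the estimate is ``sum_i w_i * x_(i)`` with weights given by increments of the
Beta CDF, ``w_i = I(i/n; (n+1)p, (n+1)(1-p)) - I((i-1)/n; (n+1)p, (n+1)(1-p))``,
which mirrors the computation in the private helper ``_hd_1D`` below.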
Parameters
----------
data : array_like
Data array.
prob : sequence, optional
Sequence of probabilities at which to compute the quantiles.
axis : int or None, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
var : bool, optional
Whether to return the variance of the estimate.
Returns
-------
hdquantiles : MaskedArray
A (p,) array of quantiles (if `var` is False), or a (2,p) array of
quantiles and variances (if `var` is True), where ``p`` is the
number of quantiles.
See Also
--------
hdquantiles_sd
Examples
--------
>>> import numpy as np
>>> from scipy.stats.mstats import hdquantiles
>>>
>>> # Sample data
>>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
>>>
>>> # Probabilities at which to compute quantiles
>>> probabilities = [0.25, 0.5, 0.75]
>>>
>>> # Compute Harrell-Davis quantile estimates
>>> quantile_estimates = hdquantiles(data, prob=probabilities)
>>>
>>> # Display the quantile estimates
>>> for i, quantile in enumerate(probabilities):
... print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
25th percentile: 3.1505820231763066 # may vary
50th percentile: 5.194344084883956
75th percentile: 7.430626414674935
"""
def _hd_1D(data,prob,var):
"Computes the HD quantiles for a 1D array. Returns nan for invalid data."
xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
# Don't use length here, in case we have a numpy scalar
n = xsorted.size
hd = np.empty((2,len(prob)), float64)
if n < 2:
hd.flat = np.nan
if var:
return hd
return hd[0]
v = np.arange(n+1) / float(n)
betacdf = beta.cdf
for (i,p) in enumerate(prob):
_w = betacdf(v, (n+1)*p, (n+1)*(1-p))
w = _w[1:] - _w[:-1]
hd_mean = np.dot(w, xsorted)
hd[0,i] = hd_mean
#
hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
#
hd[0, prob == 0] = xsorted[0]
hd[0, prob == 1] = xsorted[-1]
if var:
hd[1, prob == 0] = hd[1, prob == 1] = np.nan
return hd
return hd[0]
# Initialization & checks
data = ma.array(data, copy=False, dtype=float64)
p = np.atleast_1d(np.asarray(prob))
# Computes quantiles along axis (or globally)
if (axis is None) or (data.ndim == 1):
result = _hd_1D(data, p, var)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
return ma.fix_invalid(result, copy=False)
def hdmedian(data, axis=-1, var=False):
"""
Returns the Harrell-Davis estimate of the median along the given axis.
Parameters
----------
data : ndarray
Data array.
axis : int, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
var : bool, optional
Whether to return the variance of the estimate.
Returns
-------
hdmedian : MaskedArray
The median values. If ``var=True``, the variance is returned inside
the masked array. E.g. for a 1-D array the shape changes from (1,) to
(2,).
"""
result = hdquantiles(data,[0.5], axis=axis, var=var)
return result.squeeze()
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
"""
The standard error of the Harrell-Davis quantile estimates by jackknife.
Parameters
----------
data : array_like
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
axis : int, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
Returns
-------
hdquantiles_sd : MaskedArray
Standard error of the Harrell-Davis quantile estimates.
See Also
--------
hdquantiles
"""
def _hdsd_1D(data, prob):
"Computes the std error for 1D arrays."
xsorted = np.sort(data.compressed())
n = len(xsorted)
hdsd = np.empty(len(prob), float64)
if n < 2:
hdsd.flat = np.nan
vv = np.arange(n) / float(n-1)
betacdf = beta.cdf
for (i,p) in enumerate(prob):
_w = betacdf(vv, n*p, n*(1-p))
w = _w[1:] - _w[:-1]
# cumulative sum of weights and data points if
# ith point is left out for jackknife
mx_ = np.zeros_like(xsorted)
mx_[1:] = np.cumsum(w * xsorted[:-1])
# similar but from the right
mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
hdsd[i] = np.sqrt(mx_.var() * (n - 1))
return hdsd
# Initialization & checks
data = ma.array(data, copy=False, dtype=float64)
p = np.atleast_1d(np.asarray(prob))
# Computes quantiles along axis (or globally)
if (axis is None):
result = _hdsd_1D(data, p)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
return ma.fix_invalid(result, copy=False).ravel()
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
alpha=0.05, axis=None):
"""
Selected confidence interval of the trimmed mean along the given axis.
Parameters
----------
data : array_like
Input data.
limits : {None, tuple}, optional
None or a two item tuple.
Tuple of the percentages to cut on each side of the array, with respect
to the number of unmasked data, as floats between 0. and 1. If ``n``
is the number of unmasked data before trimming, then
(``n * limits[0]``)th smallest data and (``n * limits[1]``)th
largest data are masked. The total number of unmasked data after
trimming is ``n * (1. - sum(limits))``.
The value of one limit can be set to None to indicate an open interval.
Defaults to (0.2, 0.2).
inclusive : (2,) tuple of boolean, optional
If relative==False, tuple indicating whether values exactly equal to
the absolute limits are allowed.
If relative==True, tuple indicating whether the number of data being
masked on each side should be rounded (True) or truncated (False).
Defaults to (True, True).
alpha : float, optional
Confidence level of the intervals.
Defaults to 0.05.
axis : int, optional
Axis along which to cut. If None, uses a flattened version of `data`.
Defaults to None.
Returns
-------
trimmed_mean_ci : (2,) ndarray
The lower and upper confidence intervals of the trimmed data.
"""
data = ma.array(data, copy=False)
trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
tmean = trimmed.mean(axis)
tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
df = trimmed.count(axis) - 1
tppf = t.ppf(1-alpha/2.,df)
return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
"""
Returns the Maritz-Jarrett estimators of the standard error of selected
experimental quantiles of the data.
Parameters
----------
data : ndarray
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
axis : int or None, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
"""
def _mjci_1D(data, p):
data = np.sort(data.compressed())
n = data.size
prob = (np.array(p) * n + 0.5).astype(int)
betacdf = beta.cdf
mj = np.empty(len(prob), float64)
x = np.arange(1,n+1, dtype=float64) / n
y = x - 1./n
for (i,m) in enumerate(prob):
W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
C1 = np.dot(W,data)
C2 = np.dot(W,data**2)
mj[i] = np.sqrt(C2 - C1**2)
return mj
data = ma.array(data, copy=False)
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
p = np.atleast_1d(np.asarray(prob))
# Computes quantiles along axis (or globally)
if (axis is None):
return _mjci_1D(data, p)
else:
return ma.apply_along_axis(_mjci_1D, axis, data, p)
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
"""
    Computes the ``1 - alpha`` confidence interval for the selected quantiles
    of the data, using the Maritz-Jarrett estimators of the standard error.
Parameters
----------
data : ndarray
Data array.
prob : sequence, optional
Sequence of quantiles to compute.
alpha : float, optional
        Significance level of the intervals; the confidence level is
        ``1 - alpha``.
axis : int or None, optional
Axis along which to compute the quantiles.
If None, use a flattened array.
Returns
-------
ci_lower : ndarray
The lower boundaries of the confidence interval. Of the same length as
`prob`.
ci_upper : ndarray
The upper boundaries of the confidence interval. Of the same length as
`prob`.
"""
alpha = min(alpha, 1 - alpha)
z = norm.ppf(1 - alpha/2.)
xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
smj = mjci(data, prob, axis=axis)
return (xq - z * smj, xq + z * smj)
def median_cihs(data, alpha=0.05, axis=None):
"""
    Computes the ``1 - alpha`` confidence interval for the median of the data.
    Uses the Hettmansperger-Sheather method.
Parameters
----------
data : array_like
Input data. Masked values are discarded. The input should be 1D only,
or `axis` should be set to None.
alpha : float, optional
        Significance level of the intervals; the confidence level is
        ``1 - alpha``.
axis : int or None, optional
Axis along which to compute the quantiles. If None, use a flattened
array.
Returns
-------
median_cihs
        The ``1 - alpha`` confidence interval for the median.
"""
def _cihs_1D(data, alpha):
data = np.sort(data.compressed())
n = len(data)
alpha = min(alpha, 1-alpha)
k = int(binom._ppf(alpha/2., n, 0.5))
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
if gk < 1-alpha:
k -= 1
gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
I = (gk - 1 + alpha)/(gk - gkk)
lambd = (n-k) * I / float(k + (n-2*k)*I)
lims = (lambd*data[k] + (1-lambd)*data[k-1],
lambd*data[n-k-1] + (1-lambd)*data[n-k])
return lims
data = ma.array(data, copy=False)
# Computes quantiles along axis (or globally)
if (axis is None):
result = _cihs_1D(data, alpha)
else:
if data.ndim > 2:
raise ValueError("Array 'data' must be at most two dimensional, "
"but got data.ndim = %d" % data.ndim)
result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
return result
def compare_medians_ms(group_1, group_2, axis=None):
"""
Compares the medians from two independent groups along the given axis.
The comparison is performed using the McKean-Schrader estimate of the
standard error of the medians.
Parameters
----------
group_1 : array_like
First dataset. Has to be of size >=7.
group_2 : array_like
Second dataset. Has to be of size >=7.
axis : int, optional
Axis along which the medians are estimated. If None, the arrays are
flattened. If `axis` is not None, then `group_1` and `group_2`
should have the same shape.
Returns
-------
compare_medians_ms : {float, ndarray}
If `axis` is None, then returns a float, otherwise returns a 1-D
ndarray of floats with a length equal to the length of `group_1`
along `axis`.
Examples
--------
>>> from scipy import stats
>>> a = [1, 2, 3, 4, 5, 6, 7]
>>> b = [8, 9, 10, 11, 12, 13, 14]
>>> stats.mstats.compare_medians_ms(a, b, axis=None)
1.0693225866553746e-05
The function is vectorized to compute along a given axis.
>>> import numpy as np
>>> rng = np.random.default_rng()
>>> x = rng.random(size=(3, 7))
>>> y = rng.random(size=(3, 8))
>>> stats.mstats.compare_medians_ms(x, y, axis=1)
array([0.36908985, 0.36092538, 0.2765313 ])
References
----------
.. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
for studentizing the sample median." Communications in
Statistics-Simulation and Computation 13.6 (1984): 751-773.
"""
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
mstats.stde_median(group_2, axis=axis))
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
return 1 - norm.cdf(W)
def idealfourths(data, axis=None):
"""
Returns an estimate of the lower and upper quartiles.
Uses the ideal fourths algorithm.
Parameters
----------
data : array_like
Input array.
axis : int, optional
Axis along which the quartiles are estimated. If None, the arrays are
flattened.
Returns
-------
idealfourths : {list of floats, masked array}
Returns the two internal values that divide `data` into four parts
using the ideal fourths algorithm either along the flattened array
(if `axis` is None) or along `axis` of `data`.
"""
def _idf(data):
x = data.compressed()
n = len(x)
if n < 3:
return [np.nan,np.nan]
(j,h) = divmod(n/4. + 5/12.,1)
j = int(j)
qlo = (1-h)*x[j-1] + h*x[j]
k = n - j
qup = (1-h)*x[k] + h*x[k-1]
return [qlo, qup]
data = ma.sort(data, axis=axis).view(MaskedArray)
if (axis is None):
return _idf(data)
else:
return ma.apply_along_axis(_idf, axis, data)
def rsh(data, points=None):
"""
Evaluates Rosenblatt's shifted histogram estimators for each data point.
Rosenblatt's estimator is a centered finite-difference approximation to the
derivative of the empirical cumulative distribution function.
Parameters
----------
data : sequence
Input data, should be 1-D. Masked values are ignored.
points : sequence or None, optional
Sequence of points where to evaluate Rosenblatt shifted histogram.
If None, use the data.
"""
data = ma.array(data, copy=False)
if points is None:
points = data
else:
points = np.atleast_1d(np.asarray(points))
if data.ndim != 1:
        raise AttributeError("The input array should be 1D only!")
n = data.count()
r = idealfourths(data, axis=None)
h = 1.2 * (r[-1]-r[0]) / n**(1./5)
nhi = (data[:,None] <= points[None,:] + h).sum(0)
nlo = (data[:,None] < points[None,:] - h).sum(0)
return (nhi-nlo) / (2.*n*h)

View File

@ -0,0 +1,459 @@
from __future__ import annotations
import warnings
from dataclasses import dataclass, field
from typing import TYPE_CHECKING
import numpy as np
from scipy import stats
from scipy.optimize import minimize_scalar
from scipy.stats._common import ConfidenceInterval
from scipy.stats._qmc import check_random_state
from scipy.stats._stats_py import _var
if TYPE_CHECKING:
import numpy.typing as npt
from scipy._lib._util import DecimalNumber, SeedType
from typing import Literal, Sequence # noqa: UP035
__all__ = [
'dunnett'
]
@dataclass
class DunnettResult:
"""Result object returned by `scipy.stats.dunnett`.
Attributes
----------
statistic : float ndarray
The computed statistic of the test for each comparison. The element
at index ``i`` is the statistic for the comparison between
groups ``i`` and the control.
pvalue : float ndarray
The computed p-value of the test for each comparison. The element
at index ``i`` is the p-value for the comparison between
group ``i`` and the control.
"""
statistic: np.ndarray
pvalue: np.ndarray
_alternative: Literal['two-sided', 'less', 'greater'] = field(repr=False)
_rho: np.ndarray = field(repr=False)
_df: int = field(repr=False)
_std: float = field(repr=False)
_mean_samples: np.ndarray = field(repr=False)
_mean_control: np.ndarray = field(repr=False)
_n_samples: np.ndarray = field(repr=False)
_n_control: int = field(repr=False)
_rng: SeedType = field(repr=False)
_ci: ConfidenceInterval | None = field(default=None, repr=False)
_ci_cl: DecimalNumber | None = field(default=None, repr=False)
def __str__(self):
# Note: `__str__` prints the confidence intervals from the most
# recent call to `confidence_interval`. If it has not been called,
# it will be called with the default CL of .95.
if self._ci is None:
self.confidence_interval(confidence_level=.95)
s = (
"Dunnett's test"
f" ({self._ci_cl*100:.1f}% Confidence Interval)\n"
"Comparison Statistic p-value Lower CI Upper CI\n"
)
for i in range(self.pvalue.size):
s += (f" (Sample {i} - Control) {self.statistic[i]:>10.3f}"
f"{self.pvalue[i]:>10.3f}"
f"{self._ci.low[i]:>10.3f}"
f"{self._ci.high[i]:>10.3f}\n")
return s
def _allowance(
self, confidence_level: DecimalNumber = 0.95, tol: DecimalNumber = 1e-3
) -> float:
"""Allowance.
It is the quantity to add/subtract from the observed difference
between the means of observed groups and the mean of the control
group. The result gives confidence limits.
Parameters
----------
confidence_level : float, optional
Confidence level for the computed confidence interval.
Default is .95.
tol : float, optional
A tolerance for numerical optimization: the allowance will produce
            a confidence level within ``10*tol*(1 - confidence_level)`` of the
specified level, or a warning will be emitted. Tight tolerances
may be impractical due to noisy evaluation of the objective.
Default is 1e-3.
Returns
-------
allowance : float
Allowance around the mean.
"""
alpha = 1 - confidence_level
def pvalue_from_stat(statistic):
statistic = np.array(statistic)
sf = _pvalue_dunnett(
rho=self._rho, df=self._df,
statistic=statistic, alternative=self._alternative,
rng=self._rng
)
return abs(sf - alpha)/alpha
# Evaluation of `pvalue_from_stat` is noisy due to the use of RQMC to
# evaluate `multivariate_t.cdf`. `minimize_scalar` is not designed
# to tolerate a noisy objective function and may fail to find the
# minimum accurately. We mitigate this possibility with the validation
# step below, but implementation of a noise-tolerant root finder or
# minimizer would be a welcome enhancement. See gh-18150.
res = minimize_scalar(pvalue_from_stat, method='brent', tol=tol)
critical_value = res.x
# validation
# tol*10 because tol=1e-3 means we tolerate a 1% change at most
if res.success is False or res.fun >= tol*10:
warnings.warn(
"Computation of the confidence interval did not converge to "
"the desired level. The confidence level corresponding with "
f"the returned interval is approximately {alpha*(1+res.fun)}.",
stacklevel=3
)
# From [1] p. 1101 between (1) and (3)
allowance = critical_value*self._std*np.sqrt(
1/self._n_samples + 1/self._n_control
)
return abs(allowance)
def confidence_interval(
self, confidence_level: DecimalNumber = 0.95
) -> ConfidenceInterval:
"""Compute the confidence interval for the specified confidence level.
Parameters
----------
confidence_level : float, optional
Confidence level for the computed confidence interval.
Default is .95.
Returns
-------
ci : ``ConfidenceInterval`` object
The object has attributes ``low`` and ``high`` that hold the
lower and upper bounds of the confidence intervals for each
comparison. The high and low values are accessible for each
comparison at index ``i`` for each group ``i``.
"""
# check to see if the supplied confidence level matches that of the
# previously computed CI.
if (self._ci is not None) and (confidence_level == self._ci_cl):
return self._ci
if not (0 < confidence_level < 1):
raise ValueError("Confidence level must be between 0 and 1.")
allowance = self._allowance(confidence_level=confidence_level)
diff_means = self._mean_samples - self._mean_control
low = diff_means-allowance
high = diff_means+allowance
if self._alternative == 'greater':
high = [np.inf] * len(diff_means)
elif self._alternative == 'less':
low = [-np.inf] * len(diff_means)
self._ci_cl = confidence_level
self._ci = ConfidenceInterval(
low=low,
high=high
)
return self._ci
def dunnett(
*samples: npt.ArrayLike, # noqa: D417
control: npt.ArrayLike,
alternative: Literal['two-sided', 'less', 'greater'] = "two-sided",
random_state: SeedType = None
) -> DunnettResult:
"""Dunnett's test: multiple comparisons of means against a control group.
This is an implementation of Dunnett's original, single-step test as
described in [1]_.
Parameters
----------
sample1, sample2, ... : 1D array_like
The sample measurements for each experimental group.
control : 1D array_like
The sample measurements for the control group.
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis.
The null hypothesis is that the means of the distributions underlying
the samples and control are equal. The following alternative
hypotheses are available (default is 'two-sided'):
* 'two-sided': the means of the distributions underlying the samples
and control are unequal.
* 'less': the means of the distributions underlying the samples
are less than the mean of the distribution underlying the control.
* 'greater': the means of the distributions underlying the
samples are greater than the mean of the distribution underlying
the control.
random_state : {None, int, `numpy.random.Generator`}, optional
If `random_state` is an int or None, a new `numpy.random.Generator` is
created using ``np.random.default_rng(random_state)``.
If `random_state` is already a ``Generator`` instance, then the
provided instance is used.
The random number generator is used to control the randomized
Quasi-Monte Carlo integration of the multivariate-t distribution.
Returns
-------
res : `~scipy.stats._result_classes.DunnettResult`
An object containing attributes:
statistic : float ndarray
The computed statistic of the test for each comparison. The element
at index ``i`` is the statistic for the comparison between
groups ``i`` and the control.
pvalue : float ndarray
The computed p-value of the test for each comparison. The element
at index ``i`` is the p-value for the comparison between
group ``i`` and the control.
And the following method:
confidence_interval(confidence_level=0.95) :
Compute the difference in means of the groups
with the control +- the allowance.
See Also
--------
tukey_hsd : performs pairwise comparison of means.
Notes
-----
Like the independent-sample t-test, Dunnett's test [1]_ is used to make
inferences about the means of distributions from which samples were drawn.
However, when multiple t-tests are performed at a fixed significance level,
the "family-wise error rate" - the probability of incorrectly rejecting the
null hypothesis in at least one test - will exceed the significance level.
Dunnett's test is designed to perform multiple comparisons while
controlling the family-wise error rate.
Dunnett's test compares the means of multiple experimental groups
against a single control group. Tukey's Honestly Significant Difference Test
is another multiple-comparison test that controls the family-wise error
rate, but `tukey_hsd` performs *all* pairwise comparisons between groups.
When pairwise comparisons between experimental groups are not needed,
Dunnett's test is preferable due to its higher power.
The use of this test relies on several assumptions.
1. The observations are independent within and among groups.
2. The observations within each group are normally distributed.
3. The distributions from which the samples are drawn have the same finite
variance.
References
----------
.. [1] Charles W. Dunnett. "A Multiple Comparison Procedure for Comparing
Several Treatments with a Control."
Journal of the American Statistical Association, 50:272, 1096-1121,
:doi:`10.1080/01621459.1955.10501294`, 1955.
Examples
--------
In [1]_, the influence of drugs on blood count measurements on three groups
    of animals is investigated.
The following table summarizes the results of the experiment in which
two groups received different drugs, and one group acted as a control.
    Blood counts (in millions of cells per cubic millimeter) were recorded:
>>> import numpy as np
>>> control = np.array([7.40, 8.50, 7.20, 8.24, 9.84, 8.32])
>>> drug_a = np.array([9.76, 8.80, 7.68, 9.36])
>>> drug_b = np.array([12.80, 9.68, 12.16, 9.20, 10.55])
We would like to see if the means between any of the groups are
significantly different. First, visually examine a box and whisker plot.
>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(1, 1)
>>> ax.boxplot([control, drug_a, drug_b])
>>> ax.set_xticklabels(["Control", "Drug A", "Drug B"]) # doctest: +SKIP
>>> ax.set_ylabel("mean") # doctest: +SKIP
>>> plt.show()
Note the overlapping interquartile ranges of the drug A group and control
group and the apparent separation between the drug B group and control
group.
Next, we will use Dunnett's test to assess whether the difference
between group means is significant while controlling the family-wise error
rate: the probability of making any false discoveries.
Let the null hypothesis be that the experimental groups have the same
mean as the control and the alternative be that an experimental group does
not have the same mean as the control. We will consider a 5% family-wise
error rate to be acceptable, and therefore we choose 0.05 as the threshold
for significance.
>>> from scipy.stats import dunnett
>>> res = dunnett(drug_a, drug_b, control=control)
>>> res.pvalue
array([0.62004941, 0.0059035 ]) # may vary
The p-value corresponding with the comparison between group A and control
exceeds 0.05, so we do not reject the null hypothesis for that comparison.
However, the p-value corresponding with the comparison between group B
and control is less than 0.05, so we consider the experimental results
to be evidence against the null hypothesis in favor of the alternative:
group B has a different mean than the control group.
"""
samples_, control_, rng = _iv_dunnett(
samples=samples, control=control,
alternative=alternative, random_state=random_state
)
rho, df, n_group, n_samples, n_control = _params_dunnett(
samples=samples_, control=control_
)
statistic, std, mean_control, mean_samples = _statistic_dunnett(
samples_, control_, df, n_samples, n_control
)
pvalue = _pvalue_dunnett(
rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng
)
return DunnettResult(
statistic=statistic, pvalue=pvalue,
_alternative=alternative,
_rho=rho, _df=df, _std=std,
_mean_samples=mean_samples,
_mean_control=mean_control,
_n_samples=n_samples,
_n_control=n_control,
_rng=rng
)
def _iv_dunnett(
samples: Sequence[npt.ArrayLike],
control: npt.ArrayLike,
alternative: Literal['two-sided', 'less', 'greater'],
random_state: SeedType
) -> tuple[list[np.ndarray], np.ndarray, SeedType]:
"""Input validation for Dunnett's test."""
rng = check_random_state(random_state)
if alternative not in {'two-sided', 'less', 'greater'}:
raise ValueError(
"alternative must be 'less', 'greater' or 'two-sided'"
)
ndim_msg = "Control and samples groups must be 1D arrays"
n_obs_msg = "Control and samples groups must have at least 1 observation"
control = np.asarray(control)
samples_ = [np.asarray(sample) for sample in samples]
# samples checks
samples_control: list[np.ndarray] = samples_ + [control]
for sample in samples_control:
if sample.ndim > 1:
raise ValueError(ndim_msg)
if sample.size < 1:
raise ValueError(n_obs_msg)
return samples_, control, rng
def _params_dunnett(
samples: list[np.ndarray], control: np.ndarray
) -> tuple[np.ndarray, int, int, np.ndarray, int]:
"""Specific parameters for Dunnett's test.
Degree of freedom is the number of observations minus the number of groups
including the control.
"""
n_samples = np.array([sample.size for sample in samples])
# From [1] p. 1100 d.f. = (sum N)-(p+1)
n_sample = n_samples.sum()
n_control = control.size
n = n_sample + n_control
n_groups = len(samples)
df = n - n_groups - 1
# From [1] p. 1103 rho_ij = 1/sqrt((N0/Ni+1)(N0/Nj+1))
rho = n_control/n_samples + 1
rho = 1/np.sqrt(rho[:, None] * rho[None, :])
np.fill_diagonal(rho, 1)
return rho, df, n_groups, n_samples, n_control
def _statistic_dunnett(
samples: list[np.ndarray], control: np.ndarray, df: int,
n_samples: np.ndarray, n_control: int
) -> tuple[np.ndarray, float, np.ndarray, np.ndarray]:
"""Statistic of Dunnett's test.
Computation based on the original single-step test from [1].
"""
mean_control = np.mean(control)
mean_samples = np.array([np.mean(sample) for sample in samples])
all_samples = [control] + samples
all_means = np.concatenate([[mean_control], mean_samples])
# Variance estimate s^2 from [1] Eq. 1
s2 = np.sum([_var(sample, mean=mean)*sample.size
for sample, mean in zip(all_samples, all_means)]) / df
std = np.sqrt(s2)
# z score inferred from [1] unlabeled equation after Eq. 1
z = (mean_samples - mean_control) / np.sqrt(1/n_samples + 1/n_control)
return z / std, std, mean_control, mean_samples
def _pvalue_dunnett(
rho: np.ndarray, df: int, statistic: np.ndarray,
alternative: Literal['two-sided', 'less', 'greater'],
rng: SeedType = None
) -> np.ndarray:
"""pvalue from the multivariate t-distribution.
Critical values come from the multivariate student-t distribution.
"""
statistic = statistic.reshape(-1, 1)
mvt = stats.multivariate_t(shape=rho, df=df, seed=rng)
if alternative == "two-sided":
statistic = abs(statistic)
pvalue = 1 - mvt.cdf(statistic, lower_limit=-statistic)
elif alternative == "greater":
pvalue = 1 - mvt.cdf(statistic, lower_limit=-np.inf)
else:
pvalue = 1 - mvt.cdf(np.inf, lower_limit=statistic)
return np.atleast_1d(pvalue)

File diff suppressed because it is too large

View File

@ -0,0 +1,482 @@
import numpy as np
from scipy.special import ndtri
from scipy.optimize import brentq
from ._discrete_distns import nchypergeom_fisher
from ._common import ConfidenceInterval
def _sample_odds_ratio(table):
"""
Given a table [[a, b], [c, d]], compute a*d/(b*c).
Return nan if the numerator and denominator are 0.
Return inf if just the denominator is 0.
"""
# table must be a 2x2 numpy array.
if table[1, 0] > 0 and table[0, 1] > 0:
oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
elif table[0, 0] == 0 or table[1, 1] == 0:
oddsratio = np.nan
else:
oddsratio = np.inf
return oddsratio
def _solve(func):
"""
Solve func(nc) = 0. func must be an increasing function.
"""
# We could just as well call the variable `x` instead of `nc`, but we
# always call this function with functions for which nc (the noncentrality
# parameter) is the variable for which we are solving.
nc = 1.0
value = func(nc)
if value == 0:
return nc
# Multiplicative factor by which to increase or decrease nc when
# searching for a bracketing interval.
factor = 2.0
# Find a bracketing interval.
if value > 0:
nc /= factor
while func(nc) > 0:
nc /= factor
lo = nc
hi = factor*nc
else:
nc *= factor
while func(nc) < 0:
nc *= factor
lo = nc/factor
hi = nc
# lo and hi bracket the solution for nc.
nc = brentq(func, lo, hi, xtol=1e-13)
return nc
def _nc_hypergeom_mean_inverse(x, M, n, N):
"""
    For the given noncentral hypergeometric parameters x, M, n, and N
(table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
contingency table), find the noncentrality parameter of Fisher's
noncentral hypergeometric distribution whose mean is x.
"""
nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
return nc
def _hypergeom_params_from_table(table):
# The notation M, n and N is consistent with stats.hypergeom and
# stats.nchypergeom_fisher.
x = table[0, 0]
M = table.sum()
n = table[0].sum()
N = table[:, 0].sum()
return x, M, n, N
def _ci_upper(table, alpha):
"""
Compute the upper end of the confidence interval.
"""
if _sample_odds_ratio(table) == np.inf:
return np.inf
x, M, n, N = _hypergeom_params_from_table(table)
# nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
# it in the lambda expression.
nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
return nc
def _ci_lower(table, alpha):
"""
Compute the lower end of the confidence interval.
"""
if _sample_odds_ratio(table) == 0:
return 0
x, M, n, N = _hypergeom_params_from_table(table)
nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
return nc
def _conditional_oddsratio(table):
"""
Conditional MLE of the odds ratio for the 2x2 contingency table.
"""
x, M, n, N = _hypergeom_params_from_table(table)
# Get the bounds of the support. The support of the noncentral
# hypergeometric distribution with parameters M, n, and N is the same
# for all values of the noncentrality parameter, so we can use 1 here.
lo, hi = nchypergeom_fisher.support(M, n, N, 1)
# Check if x is at one of the extremes of the support. If so, we know
# the odds ratio is either 0 or inf.
if x == lo:
# x is at the low end of the support.
return 0
if x == hi:
# x is at the high end of the support.
return np.inf
nc = _nc_hypergeom_mean_inverse(x, M, n, N)
return nc
def _conditional_oddsratio_ci(table, confidence_level=0.95,
alternative='two-sided'):
"""
Conditional exact confidence interval for the odds ratio.
"""
if alternative == 'two-sided':
alpha = 0.5*(1 - confidence_level)
lower = _ci_lower(table, alpha)
upper = _ci_upper(table, alpha)
elif alternative == 'less':
lower = 0.0
upper = _ci_upper(table, 1 - confidence_level)
else:
# alternative == 'greater'
lower = _ci_lower(table, 1 - confidence_level)
upper = np.inf
return lower, upper
def _sample_odds_ratio_ci(table, confidence_level=0.95,
alternative='two-sided'):
oddsratio = _sample_odds_ratio(table)
log_or = np.log(oddsratio)
se = np.sqrt((1/table).sum())
if alternative == 'less':
z = ndtri(confidence_level)
loglow = -np.inf
loghigh = log_or + z*se
elif alternative == 'greater':
z = ndtri(confidence_level)
loglow = log_or - z*se
loghigh = np.inf
else:
# alternative is 'two-sided'
z = ndtri(0.5*confidence_level + 0.5)
loglow = log_or - z*se
loghigh = log_or + z*se
return np.exp(loglow), np.exp(loghigh)
class OddsRatioResult:
"""
Result of `scipy.stats.contingency.odds_ratio`. See the
docstring for `odds_ratio` for more details.
Attributes
----------
statistic : float
The computed odds ratio.
* If `kind` is ``'sample'``, this is sample (or unconditional)
estimate, given by
``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
* If `kind` is ``'conditional'``, this is the conditional
maximum likelihood estimate for the odds ratio. It is
the noncentrality parameter of Fisher's noncentral
hypergeometric distribution with the same hypergeometric
parameters as `table` and whose mean is ``table[0, 0]``.
Methods
-------
confidence_interval :
Confidence interval for the odds ratio.
"""
def __init__(self, _table, _kind, statistic):
# for now, no need to make _table and _kind public, since this sort of
# information is returned in very few `scipy.stats` results
self._table = _table
self._kind = _kind
self.statistic = statistic
def __repr__(self):
return f"OddsRatioResult(statistic={self.statistic})"
def confidence_interval(self, confidence_level=0.95,
alternative='two-sided'):
"""
Confidence interval for the odds ratio.
Parameters
----------
        confidence_level : float
Desired confidence level for the confidence interval.
The value must be given as a fraction between 0 and 1.
Default is 0.95 (meaning 95%).
alternative : {'two-sided', 'less', 'greater'}, optional
The alternative hypothesis of the hypothesis test to which the
confidence interval corresponds. That is, suppose the null
hypothesis is that the true odds ratio equals ``OR`` and the
confidence interval is ``(low, high)``. Then the following options
for `alternative` are available (default is 'two-sided'):
* 'two-sided': the true odds ratio is not equal to ``OR``. There
is evidence against the null hypothesis at the chosen
`confidence_level` if ``high < OR`` or ``low > OR``.
* 'less': the true odds ratio is less than ``OR``. The ``low`` end
of the confidence interval is 0, and there is evidence against
the null hypothesis at the chosen `confidence_level` if
``high < OR``.
* 'greater': the true odds ratio is greater than ``OR``. The
``high`` end of the confidence interval is ``np.inf``, and there
is evidence against the null hypothesis at the chosen
`confidence_level` if ``low > OR``.
Returns
-------
ci : ``ConfidenceInterval`` instance
The confidence interval, represented as an object with
attributes ``low`` and ``high``.
Notes
-----
When `kind` is ``'conditional'``, the limits of the confidence
interval are the conditional "exact confidence limits" as described
by Fisher [1]_. The conditional odds ratio and confidence interval are
also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
When `kind` is ``'sample'``, the confidence interval is computed
under the assumption that the logarithm of the odds ratio is normally
distributed with standard error given by::
se = sqrt(1/a + 1/b + 1/c + 1/d)
where ``a``, ``b``, ``c`` and ``d`` are the elements of the
contingency table. (See, for example, [2]_, section 3.1.3.2,
or [3]_, section 2.3.3).
References
----------
.. [1] R. A. Fisher (1935), The logic of inductive inference,
Journal of the Royal Statistical Society, Vol. 98, No. 1,
pp. 39-82.
.. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
Methods, Techniques, and Applications, CRC Press LLC, Boca
Raton, Florida.
.. [3] Alan Agresti, An Introduction to Categorical Data Analysis
(second edition), Wiley, Hoboken, NJ, USA (2007).
"""
if alternative not in ['two-sided', 'less', 'greater']:
raise ValueError("`alternative` must be 'two-sided', 'less' or "
"'greater'.")
if confidence_level < 0 or confidence_level > 1:
raise ValueError('confidence_level must be between 0 and 1')
if self._kind == 'conditional':
ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
else:
ci = self._sample_odds_ratio_ci(confidence_level, alternative)
return ci
def _conditional_odds_ratio_ci(self, confidence_level=0.95,
alternative='two-sided'):
"""
Confidence interval for the conditional odds ratio.
"""
table = self._table
if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
# If both values in a row or column are zero, the p-value is 1,
# the odds ratio is NaN and the confidence interval is (0, inf).
ci = (0, np.inf)
else:
ci = _conditional_oddsratio_ci(table,
confidence_level=confidence_level,
alternative=alternative)
return ConfidenceInterval(low=ci[0], high=ci[1])
def _sample_odds_ratio_ci(self, confidence_level=0.95,
alternative='two-sided'):
"""
Confidence interval for the sample odds ratio.
"""
if confidence_level < 0 or confidence_level > 1:
raise ValueError('confidence_level must be between 0 and 1')
table = self._table
if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
# If both values in a row or column are zero, the p-value is 1,
# the odds ratio is NaN and the confidence interval is (0, inf).
ci = (0, np.inf)
else:
ci = _sample_odds_ratio_ci(table,
confidence_level=confidence_level,
alternative=alternative)
return ConfidenceInterval(low=ci[0], high=ci[1])
def odds_ratio(table, *, kind='conditional'):
r"""
Compute the odds ratio for a 2x2 contingency table.
Parameters
----------
table : array_like of ints
A 2x2 contingency table. Elements must be non-negative integers.
kind : str, optional
Which kind of odds ratio to compute, either the sample
odds ratio (``kind='sample'``) or the conditional odds ratio
(``kind='conditional'``). Default is ``'conditional'``.
Returns
-------
result : `~scipy.stats._result_classes.OddsRatioResult` instance
The returned object has two computed attributes:
statistic : float
* If `kind` is ``'sample'``, this is sample (or unconditional)
estimate, given by
``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
* If `kind` is ``'conditional'``, this is the conditional
maximum likelihood estimate for the odds ratio. It is
the noncentrality parameter of Fisher's noncentral
hypergeometric distribution with the same hypergeometric
parameters as `table` and whose mean is ``table[0, 0]``.
The object has the method `confidence_interval` that computes
the confidence interval of the odds ratio.
See Also
--------
scipy.stats.fisher_exact
relative_risk
Notes
-----
The conditional odds ratio was discussed by Fisher (see "Example 1"
of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.
.. versionadded:: 1.10.0
References
----------
.. [1] R. A. Fisher (1935), The logic of inductive inference,
Journal of the Royal Statistical Society, Vol. 98, No. 1,
pp. 39-82.
.. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
Volume I - The analysis of case-control studies. IARC Sci Publ.
(32):5-338. PMID: 7216345. (See section 4.2.)
.. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
Methods, Techniques, and Applications, CRC Press LLC, Boca
Raton, Florida.
.. [4] Berger, Jeffrey S. et al. "Aspirin for the Primary Prevention of
Cardiovascular Events in Women and Men: A Sex-Specific
Meta-analysis of Randomized Controlled Trials."
JAMA, 295(3):306-313, :doi:`10.1001/jama.295.3.306`, 2006.
Examples
--------
In epidemiology, individuals are classified as "exposed" or
"unexposed" to some factor or treatment. If the occurrence of some
illness is under study, those who have the illness are often
classified as "cases", and those without it are "noncases". The
counts of the occurrences of these classes gives a contingency
table::
exposed unexposed
cases a b
noncases c d
The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can
be interpreted as the odds of a case occurring in the exposed group,
and ``b/d`` as the odds of a case occurring in the unexposed group.
The sample odds ratio is the ratio of these odds. If the odds ratio
is greater than 1, it suggests that there is a positive association
between being exposed and being a case.
Interchanging the rows or columns of the contingency table inverts
the odds ratio, so it is important to understand the meaning of labels
given to the rows and columns of the table when interpreting the
odds ratio.
In [4]_, the use of aspirin to prevent cardiovascular events in women
and men was investigated. The study notably concluded:
...aspirin therapy reduced the risk of a composite of
cardiovascular events due to its effect on reducing the risk of
ischemic stroke in women [...]
The article lists studies of various cardiovascular events. Let's
    focus on the ischemic stroke in women.
The following table summarizes the results of the experiment in which
participants took aspirin or a placebo on a regular basis for several
years. Cases of ischemic stroke were recorded::
Aspirin Control/Placebo
Ischemic stroke 176 230
No stroke 21035 21018
The question we ask is "Is there evidence that the aspirin reduces the
risk of ischemic stroke?"
Compute the odds ratio:
>>> from scipy.stats.contingency import odds_ratio
>>> res = odds_ratio([[176, 230], [21035, 21018]])
>>> res.statistic
0.7646037659999126
For this sample, the odds of getting an ischemic stroke for those who have
been taking aspirin are 0.76 times that of those
who have received the placebo.
To make statistical inferences about the population under study,
we can compute the 95% confidence interval for the odds ratio:
>>> res.confidence_interval(confidence_level=0.95)
ConfidenceInterval(low=0.6241234078749812, high=0.9354102892100372)
The 95% confidence interval for the conditional odds ratio is
approximately (0.62, 0.94).
The fact that the entire 95% confidence interval falls below 1 supports
the authors' conclusion that the aspirin was associated with a
statistically significant reduction in ischemic stroke.
"""
if kind not in ['conditional', 'sample']:
raise ValueError("`kind` must be 'conditional' or 'sample'.")
c = np.asarray(table)
if c.shape != (2, 2):
raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
"of shape (2, 2).")
if not np.issubdtype(c.dtype, np.integer):
raise ValueError("`table` must be an array of integers, but got "
f"type {c.dtype}")
c = c.astype(np.int64)
if np.any(c < 0):
raise ValueError("All values in `table` must be nonnegative.")
if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
# If both values in a row or column are zero, the p-value is NaN and
# the odds ratio is NaN.
result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
return result
if kind == 'sample':
oddsratio = _sample_odds_ratio(c)
else: # kind is 'conditional'
oddsratio = _conditional_oddsratio(c)
result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
return result

View File

@ -0,0 +1,479 @@
from itertools import permutations
import numpy as np
import math
from ._continuous_distns import norm
import scipy.stats
from dataclasses import dataclass
@dataclass
class PageTrendTestResult:
statistic: float
pvalue: float
method: str
def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
r"""
Perform Page's Test, a measure of trend in observations between treatments.
Page's Test (also known as Page's :math:`L` test) is useful when:
* there are :math:`n \geq 3` treatments,
* :math:`m \geq 2` subjects are observed for each treatment, and
* the observations are hypothesized to have a particular order.
Specifically, the test considers the null hypothesis that
.. math::
m_1 = m_2 = m_3 \cdots = m_n,
where :math:`m_j` is the mean of the observed quantity under treatment
:math:`j`, against the alternative hypothesis that
.. math::
m_1 \leq m_2 \leq m_3 \leq \cdots \leq m_n,
where at least one inequality is strict.
As noted by [4]_, Page's :math:`L` test has greater statistical power than
the Friedman test against the alternative that there is a difference in
trend, as Friedman's test only considers a difference in the means of the
observations without considering their order. Whereas Spearman :math:`\rho`
considers the correlation between the ranked observations of two variables
(e.g. the airspeed velocity of a swallow vs. the weight of the coconut it
carries), Page's :math:`L` is concerned with a trend in an observation
(e.g. the airspeed velocity of a swallow) across several distinct
treatments (e.g. carrying each of five coconuts of different weight) even
as the observation is repeated with multiple subjects (e.g. one European
swallow and one African swallow).
Parameters
----------
data : array-like
A :math:`m \times n` array; the element in row :math:`i` and
column :math:`j` is the observation corresponding with subject
:math:`i` and treatment :math:`j`. By default, the columns are
assumed to be arranged in order of increasing predicted mean.
ranked : boolean, optional
By default, `data` is assumed to be observations rather than ranks;
it will be ranked with `scipy.stats.rankdata` along ``axis=1``. If
`data` is provided in the form of ranks, pass argument ``True``.
predicted_ranks : array-like, optional
The predicted ranks of the column means. If not specified,
the columns are assumed to be arranged in order of increasing
predicted mean, so the default `predicted_ranks` are
:math:`[1, 2, \dots, n-1, n]`.
method : {'auto', 'asymptotic', 'exact'}, optional
Selects the method used to calculate the *p*-value. The following
options are available.
* 'auto': selects between 'exact' and 'asymptotic' to
achieve reasonably accurate results in reasonable time (default)
* 'asymptotic': compares the standardized test statistic against
the normal distribution
* 'exact': computes the exact *p*-value by comparing the observed
:math:`L` statistic against those realized by all possible
permutations of ranks (under the null hypothesis that each
permutation is equally likely)
Returns
-------
res : PageTrendTestResult
An object containing attributes:
statistic : float
Page's :math:`L` test statistic.
pvalue : float
The associated *p*-value
method : {'asymptotic', 'exact'}
The method used to compute the *p*-value
See Also
--------
rankdata, friedmanchisquare, spearmanr
Notes
-----
As noted in [1]_, "the :math:`n` 'treatments' could just as well represent
:math:`n` objects or events or performances or persons or trials ranked."
Similarly, the :math:`m` 'subjects' could equally stand for :math:`m`
"groupings by ability or some other control variable, or judges doing
the ranking, or random replications of some other sort."
The procedure for calculating the :math:`L` statistic, adapted from
[1]_, is:
1. "Predetermine with careful logic the appropriate hypotheses
concerning the predicted ordering of the experimental results.
If no reasonable basis for ordering any treatments is known, the
:math:`L` test is not appropriate."
2. "As in other experiments, determine at what level of confidence
you will reject the null hypothesis that there is no agreement of
experimental results with the monotonic hypothesis."
3. "Cast the experimental material into a two-way table of :math:`n`
columns (treatments, objects ranked, conditions) and :math:`m`
rows (subjects, replication groups, levels of control variables)."
4. "When experimental observations are recorded, rank them across each
row", e.g. ``ranks = scipy.stats.rankdata(data, axis=1)``.
5. "Add the ranks in each column", e.g.
``colsums = np.sum(ranks, axis=0)``.
6. "Multiply each sum of ranks by the predicted rank for that same
column", e.g. ``products = predicted_ranks * colsums``.
7. "Sum all such products", e.g. ``L = products.sum()``.
[1]_ continues by suggesting use of the standardized statistic
.. math::
\chi_L^2 = \frac{\left[12L-3mn(n+1)^2\right]^2}{mn^2(n^2-1)(n+1)}
"which is distributed approximately as chi-square with 1 degree of
freedom. The ordinary use of :math:`\chi^2` tables would be
equivalent to a two-sided test of agreement. If a one-sided test
is desired, *as will almost always be the case*, the probability
discovered in the chi-square table should be *halved*."
However, this standardized statistic does not distinguish between the
observed values being well correlated with the predicted ranks and being
    *anti*-correlated with the predicted ranks. Instead, we follow [2]_
and calculate the standardized statistic
.. math::
\Lambda = \frac{L - E_0}{\sqrt{V_0}},
where :math:`E_0 = \frac{1}{4} mn(n+1)^2` and
:math:`V_0 = \frac{1}{144} mn^2(n+1)(n^2-1)`, "which is asymptotically
normal under the null hypothesis".
The *p*-value for ``method='exact'`` is generated by comparing the observed
value of :math:`L` against the :math:`L` values generated for all
:math:`(n!)^m` possible permutations of ranks. The calculation is performed
    using the recursive method of [5]_.
The *p*-values are not adjusted for the possibility of ties. When
ties are present, the reported ``'exact'`` *p*-values may be somewhat
larger (i.e. more conservative) than the true *p*-value [2]_. The
    ``'asymptotic'`` *p*-values, however, tend to be smaller (i.e. less
conservative) than the ``'exact'`` *p*-values.
References
----------
.. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
a significant test for linear ranks", *Journal of the American
Statistical Association* 58(301), p. 216--230, 1963.
.. [2] Markus Neuhauser, *Nonparametric Statistical Test: A computational
approach*, CRC Press, p. 150--152, 2012.
.. [3] Statext LLC, "Page's L Trend Test - Easy Statistics", *Statext -
Statistics Study*, https://www.statext.com/practice/PageTrendTest03.php,
Accessed July 12, 2020.
.. [4] "Page's Trend Test", *Wikipedia*, WikimediaFoundation,
https://en.wikipedia.org/wiki/Page%27s_trend_test,
Accessed July 12, 2020.
.. [5] Robert E. Odeh, "The exact distribution of Page's L-statistic in
the two-way layout", *Communications in Statistics - Simulation and
Computation*, 6(1), p. 49--61, 1977.
Examples
--------
We use the example from [3]_: 10 students are asked to rate three
teaching methods - tutorial, lecture, and seminar - on a scale of 1-5,
with 1 being the lowest and 5 being the highest. We have decided that
a confidence level of 99% is required to reject the null hypothesis in
favor of our alternative: that the seminar will have the highest ratings
and the tutorial will have the lowest. Initially, the data have been
tabulated with each row representing an individual student's ratings of
the three methods in the following order: tutorial, lecture, seminar.
>>> table = [[3, 4, 3],
... [2, 2, 4],
... [3, 3, 5],
... [1, 3, 2],
... [2, 3, 2],
... [2, 4, 5],
... [1, 2, 4],
... [3, 4, 4],
... [2, 4, 5],
... [1, 3, 4]]
Because the tutorial is hypothesized to have the lowest ratings, the
column corresponding with tutorial rankings should be first; the seminar
is hypothesized to have the highest ratings, so its column should be last.
Since the columns are already arranged in this order of increasing
predicted mean, we can pass the table directly into `page_trend_test`.
>>> from scipy.stats import page_trend_test
>>> res = page_trend_test(table)
>>> res
PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
method='exact')
This *p*-value indicates that there is a 0.1819% chance that
the :math:`L` statistic would reach such an extreme value under the null
hypothesis. Because 0.1819% is less than 1%, we have evidence to reject
the null hypothesis in favor of our alternative at a 99% confidence level.
The value of the :math:`L` statistic is 133.5. To check this manually,
we rank the data such that high scores correspond with high ranks, settling
ties with an average rank:
>>> from scipy.stats import rankdata
>>> ranks = rankdata(table, axis=1)
>>> ranks
array([[1.5, 3. , 1.5],
[1.5, 1.5, 3. ],
[1.5, 1.5, 3. ],
[1. , 3. , 2. ],
[1.5, 3. , 1.5],
[1. , 2. , 3. ],
[1. , 2. , 3. ],
[1. , 2.5, 2.5],
[1. , 2. , 3. ],
[1. , 2. , 3. ]])
We add the ranks within each column, multiply the sums by the
predicted ranks, and sum the products.
>>> import numpy as np
>>> m, n = ranks.shape
>>> predicted_ranks = np.arange(1, n+1)
>>> L = (predicted_ranks * np.sum(ranks, axis=0)).sum()
>>> res.statistic == L
True
As presented in [3]_, the asymptotic approximation of the *p*-value is the
survival function of the normal distribution evaluated at the standardized
test statistic:
>>> from scipy.stats import norm
>>> E0 = (m*n*(n+1)**2)/4
>>> V0 = (m*n**2*(n+1)*(n**2-1))/144
>>> Lambda = (L-E0)/np.sqrt(V0)
>>> p = norm.sf(Lambda)
>>> p
0.0012693433690751756
This does not precisely match the *p*-value reported by `page_trend_test`
above. The asymptotic distribution is not very accurate, nor conservative,
for :math:`m \leq 12` and :math:`n \leq 8`, so `page_trend_test` chose to
use ``method='exact'`` based on the dimensions of the table and the
recommendations in Page's original paper [1]_. To override
`page_trend_test`'s choice, provide the `method` argument.
>>> res = page_trend_test(table, method="asymptotic")
>>> res
PageTrendTestResult(statistic=133.5, pvalue=0.0012693433690751756,
method='asymptotic')
If the data are already ranked, we can pass in the ``ranks`` instead of
the ``table`` to save computation time.
>>> res = page_trend_test(ranks, # ranks of data
... ranked=True, # data is already ranked
... )
>>> res
PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
method='exact')
Suppose the raw data had been tabulated in an order different from the
order of predicted means, say lecture, seminar, tutorial.
>>> table = np.asarray(table)[:, [1, 2, 0]]
Since the arrangement of this table is not consistent with the assumed
ordering, we can either rearrange the table or provide the
`predicted_ranks`. Remembering that the lecture is predicted
to have the middle rank, the seminar the highest, and tutorial the lowest,
we pass:
>>> res = page_trend_test(table, # data as originally tabulated
... predicted_ranks=[2, 3, 1], # our predicted order
... )
>>> res
PageTrendTestResult(statistic=133.5, pvalue=0.0018191161948127822,
method='exact')
"""
# Possible values of the method parameter and the corresponding function
# used to evaluate the p value
methods = {"asymptotic": _l_p_asymptotic,
"exact": _l_p_exact,
"auto": None}
if method not in methods:
raise ValueError(f"`method` must be in {set(methods)}")
ranks = np.asarray(data)
if ranks.ndim != 2: # TODO: relax this to accept 3d arrays?
raise ValueError("`data` must be a 2d array.")
m, n = ranks.shape
if m < 2 or n < 3:
raise ValueError("Page's L is only appropriate for data with two "
"or more rows and three or more columns.")
if np.any(np.isnan(data)):
raise ValueError("`data` contains NaNs, which cannot be ranked "
"meaningfully")
# ensure NumPy array and rank the data if it's not already ranked
if ranked:
# Only a basic check on whether data is ranked. Checking that the data
# is properly ranked could take as much time as ranking it.
if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
raise ValueError("`data` is not properly ranked. Rank the data or "
"pass `ranked=False`.")
else:
ranks = scipy.stats.rankdata(data, axis=-1)
# generate predicted ranks if not provided, ensure valid NumPy array
if predicted_ranks is None:
predicted_ranks = np.arange(1, n+1)
else:
predicted_ranks = np.asarray(predicted_ranks)
if (predicted_ranks.ndim < 1 or
(set(predicted_ranks) != set(range(1, n+1)) or
len(predicted_ranks) != n)):
raise ValueError(f"`predicted_ranks` must include each integer "
f"from 1 to {n} (the number of columns in "
f"`data`) exactly once.")
if not isinstance(ranked, bool):
raise TypeError("`ranked` must be boolean.")
# Calculate the L statistic
L = _l_vectorized(ranks, predicted_ranks)
# Calculate the p-value
if method == "auto":
method = _choose_method(ranks)
p_fun = methods[method] # get the function corresponding with the method
p = p_fun(L, m, n)
page_result = PageTrendTestResult(statistic=L, pvalue=p, method=method)
return page_result
def _choose_method(ranks):
'''Choose method for computing p-value automatically'''
m, n = ranks.shape
if n > 8 or (m > 12 and n > 3) or m > 20: # as in [1], [4]
method = "asymptotic"
else:
method = "exact"
return method
def _l_vectorized(ranks, predicted_ranks):
    '''Calculates Page's L statistic for each page of a 3d array'''
colsums = ranks.sum(axis=-2, keepdims=True)
products = predicted_ranks * colsums
Ls = products.sum(axis=-1)
Ls = Ls[0] if Ls.size == 1 else Ls.ravel()
return Ls
def _l_p_asymptotic(L, m, n):
'''Calculate the p-value of Page's L from the asymptotic distribution'''
# Using [1] as a reference, the asymptotic p-value would be calculated as:
# chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
# p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
# but this is insensitive to the direction of the hypothesized ranking
# See [2] page 151
E0 = (m*n*(n+1)**2)/4
V0 = (m*n**2*(n+1)*(n**2-1))/144
Lambda = (L-E0)/np.sqrt(V0)
# This is a one-sided "greater" test - calculate the probability that the
# L statistic under H0 would be greater than the observed L statistic
p = norm.sf(Lambda)
return p
def _l_p_exact(L, m, n):
'''Calculate the p-value of Page's L exactly'''
# [1] uses m, n; [5] uses n, k.
# Switch convention here because exact calculation code references [5].
L, n, k = int(L), int(m), int(n)
_pagel_state.set_k(k)
return _pagel_state.sf(L, n)
class _PageL:
'''Maintains state between `page_trend_test` executions'''
def __init__(self):
'''Lightweight initialization'''
self.all_pmfs = {}
def set_k(self, k):
        '''Calculate the lower and upper limits of L for a single row'''
self.k = k
# See [5] top of page 52
self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6
def sf(self, l, n):
'''Survival function of Page's L statistic'''
ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
return np.sum(ps)
def p_l_k_1(self):
'''Relative frequency of each L value over all possible single rows'''
# See [5] Equation (6)
ranks = range(1, self.k+1)
# generate all possible rows of length k
rank_perms = np.array(list(permutations(ranks)))
# compute Page's L for all possible rows
Ls = (ranks*rank_perms).sum(axis=1)
# count occurrences of each L value
counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
# factorial(k) is number of possible permutations
return counts/math.factorial(self.k)
def pmf(self, l, n):
'''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''
if n not in self.all_pmfs:
self.all_pmfs[n] = {}
if self.k not in self.all_pmfs[n]:
self.all_pmfs[n][self.k] = {}
# Cache results to avoid repeating calculation. Initially this was
# written with lru_cache, but this seems faster? Also, we could add
# an option to save this for future lookup.
if l in self.all_pmfs[n][self.k]:
return self.all_pmfs[n][self.k][l]
if n == 1:
ps = self.p_l_k_1() # [5] Equation 6
ls = range(self.a, self.b+1)
# not fast, but we'll only be here once
self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
return self.all_pmfs[n][self.k][l]
p = 0
low = max(l-(n-1)*self.b, self.a) # [5] Equation 2
high = min(l-(n-1)*self.a, self.b)
# [5] Equation 1
for t in range(low, high+1):
p1 = self.pmf(l-t, n-1)
p2 = self.pmf(t, 1)
p += p1*p2
self.all_pmfs[n][self.k][l] = p
return p
# Maintain state for faster repeat calls to page_trend_test w/ method='exact'
_pagel_state = _PageL()

File diff suppressed because it is too large

View File

@ -0,0 +1,54 @@
import numpy as np
from scipy._lib._util import DecimalNumber, IntNumber
def _cy_wrapper_centered_discrepancy(
sample: np.ndarray,
iterative: bool,
workers: IntNumber,
) -> float: ...
def _cy_wrapper_wrap_around_discrepancy(
sample: np.ndarray,
iterative: bool,
workers: IntNumber,
) -> float: ...
def _cy_wrapper_mixture_discrepancy(
sample: np.ndarray,
iterative: bool,
workers: IntNumber,
) -> float: ...
def _cy_wrapper_l2_star_discrepancy(
sample: np.ndarray,
iterative: bool,
workers: IntNumber,
) -> float: ...
def _cy_wrapper_update_discrepancy(
x_new_view: np.ndarray,
sample_view: np.ndarray,
initial_disc: DecimalNumber,
) -> float: ...
def _cy_van_der_corput(
n: IntNumber,
base: IntNumber,
start_index: IntNumber,
workers: IntNumber,
) -> np.ndarray: ...
def _cy_van_der_corput_scrambled(
n: IntNumber,
base: IntNumber,
start_index: IntNumber,
permutations: np.ndarray,
workers: IntNumber,
) -> np.ndarray: ...

View File

@ -0,0 +1,533 @@
# Integration of multivariate normal and t distributions.
# Adapted from the MATLAB original implementations by Dr. Alan Genz.
# http://www.math.wsu.edu/faculty/genz/software/software.html
# Copyright (C) 2013, Alan Genz, All rights reserved.
# Python implementation is copyright (C) 2022, Robert Kern, All rights
# reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided the following conditions are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. The contributor name(s) may not be used to endorse or promote
# products derived from this software without specific prior
# written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
from scipy.fft import fft, ifft
from scipy.special import gammaincinv, ndtr, ndtri
from scipy.stats._qmc import primes_from_2_to
phi = ndtr
phinv = ndtri
def _factorize_int(n):
"""Return a sorted list of the unique prime factors of a positive integer.
"""
    # NOTE: There are lots of faster ways to do this, but this isn't terrible.
factors = set()
for p in primes_from_2_to(int(np.sqrt(n)) + 1):
while not (n % p):
factors.add(p)
n //= p
if n == 1:
break
if n != 1:
factors.add(n)
return sorted(factors)
def _primitive_root(p):
"""Compute a primitive root of the prime number `p`.
Used in the CBC lattice construction.
References
----------
.. [1] https://en.wikipedia.org/wiki/Primitive_root_modulo_n
"""
# p is prime
pm = p - 1
factors = _factorize_int(pm)
n = len(factors)
r = 2
k = 0
while k < n:
d = pm // factors[k]
# pow() doesn't like numpy scalar types.
rd = pow(int(r), int(d), int(p))
if rd == 1:
r += 1
k = 0
else:
k += 1
return r
def _cbc_lattice(n_dim, n_qmc_samples):
"""Compute a QMC lattice generator using a Fast CBC construction.
Parameters
----------
n_dim : int > 0
The number of dimensions for the lattice.
n_qmc_samples : int > 0
The desired number of QMC samples. This will be rounded down to the
nearest prime to enable the CBC construction.
Returns
-------
q : float array : shape=(n_dim,)
The lattice generator vector. All values are in the open interval
`(0, 1)`.
actual_n_qmc_samples : int
The prime number of QMC samples that must be used with this lattice,
no more, no less.
References
----------
.. [1] Nuyens, D. and Cools, R. "Fast Component-by-Component Construction,
a Reprise for Different Kernels", In H. Niederreiter and D. Talay,
editors, Monte-Carlo and Quasi-Monte Carlo Methods 2004,
Springer-Verlag, 2006, 371-385.
"""
# Round down to the nearest prime number.
primes = primes_from_2_to(n_qmc_samples + 1)
n_qmc_samples = primes[-1]
bt = np.ones(n_dim)
gm = np.hstack([1.0, 0.8 ** np.arange(n_dim - 1)])
q = 1
w = 0
z = np.arange(1, n_dim + 1)
m = (n_qmc_samples - 1) // 2
g = _primitive_root(n_qmc_samples)
# Slightly faster way to compute perm[j] = pow(g, j, n_qmc_samples)
# Shame that we don't have modulo pow() implemented as a ufunc.
perm = np.ones(m, dtype=int)
for j in range(m - 1):
perm[j + 1] = (g * perm[j]) % n_qmc_samples
perm = np.minimum(n_qmc_samples - perm, perm)
pn = perm / n_qmc_samples
c = pn * pn - pn + 1.0 / 6
fc = fft(c)
for s in range(1, n_dim):
reordered = np.hstack([
c[:w+1][::-1],
c[w+1:m][::-1],
])
q = q * (bt[s-1] + gm[s-1] * reordered)
w = ifft(fc * fft(q)).real.argmin()
z[s] = perm[w]
q = z / n_qmc_samples
return q, n_qmc_samples
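# Illustrative sketch (not part of the original module): how the generator
# vector returned by `_cbc_lattice` would typically be turned into a
# randomly-shifted rank-1 lattice point set (the same construction used by
# `_qmvn` and `_qmvt` below). The dimensions and sample count are arbitrary
# example values:
#
#     >>> rng = np.random.default_rng()
#     >>> q, n = _cbc_lattice(n_dim=3, n_qmc_samples=1000)
#     >>> i = np.arange(1, n + 1)
#     >>> pts = (np.outer(i, q) + rng.random(3)) % 1.0  # (n, 3) points in [0, 1)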
# Note: this function is not currently used or tested by any SciPy code. It is
# included in this file to facilitate the development of a parameter for users
# to set the desired CDF accuracy, but must be reviewed and tested before use.
def _qauto(func, covar, low, high, rng, error=1e-3, limit=10_000, **kwds):
"""Automatically rerun the integration to get the required error bound.
Parameters
----------
func : callable
Either :func:`_qmvn` or :func:`_qmvt`.
covar, low, high : array
As specified in :func:`_qmvn` and :func:`_qmvt`.
rng : Generator
Pseudorandom number generator, e.g. ``np.random.default_rng()``.
error : float > 0
The desired error bound.
limit : int > 0
The rough limit of the number of integration points to consider. The
integration will stop looping once this limit has been *exceeded*.
**kwds :
Other keyword arguments to pass to `func`. When using :func:`_qmvt`, be
sure to include ``nu=`` as one of these.
Returns
-------
prob : float
The estimated probability mass within the bounds.
est_error : float
3 times the standard error of the batch estimates.
n_samples : int
The number of integration points actually used.
"""
n = len(covar)
n_samples = 0
if n == 1:
prob = phi(high) - phi(low)
# The 1-D case is evaluated directly, so report an essentially negligible error.
est_error = 1e-15
else:
mi = min(limit, n * 1000)
prob = 0.0
est_error = 1.0
ei = 0.0
while est_error > error and n_samples < limit:
mi = round(np.sqrt(2) * mi)
pi, ei, ni = func(mi, covar, low, high, rng=rng, **kwds)
n_samples += ni
wt = 1.0 / (1 + (ei / est_error)**2)
prob += wt * (pi - prob)
est_error = np.sqrt(wt) * ei
return prob, est_error, n_samples
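# Illustrative sketch (not part of the original module): how `_qauto` might
# drive `_qmvn` until the estimated error drops below a tolerance. The
# covariance and bounds are arbitrary example values:
#
#     >>> rng = np.random.default_rng()
#     >>> covar = np.array([[1.0, 0.5], [0.5, 1.0]])
#     >>> low, high = np.array([-1.0, -2.0]), np.array([1.0, 2.0])
#     >>> p, err, n_used = _qauto(_qmvn, covar, low, high, rng, error=1e-3)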
# Note: this function is not currently used or tested by any SciPy code. It is
# included in this file to facilitate the resolution of gh-8367, gh-16142, and
# possibly gh-14286, but must be reviewed and tested before use.
def _qmvn(m, covar, low, high, rng, lattice='cbc', n_batches=10):
"""Multivariate normal integration over box bounds.
Parameters
----------
m : int > n_batches
The number of points to sample. This number will be divided into
`n_batches` batches that apply random offsets of the sampling lattice
for each batch in order to estimate the error.
covar : (n, n) float array
Possibly singular, positive semidefinite symmetric covariance matrix.
low, high : (n,) float array
The low and high integration bounds.
rng : Generator
Pseudorandom number generator, e.g. ``np.random.default_rng()``.
lattice : 'cbc' or callable
The type of lattice rule to use to construct the integration points.
n_batches : int > 0, optional
The number of QMC batches to apply.
Returns
-------
prob : float
The estimated probability mass within the bounds.
est_error : float
3 times the standard error of the batch estimates.
"""
cho, lo, hi = _permuted_cholesky(covar, low, high)
n = cho.shape[0]
ct = cho[0, 0]
c = phi(lo[0] / ct)
d = phi(hi[0] / ct)
ci = c
dci = d - ci
prob = 0.0
error_var = 0.0
q, n_qmc_samples = _cbc_lattice(n - 1, max(m // n_batches, 1))
y = np.zeros((n - 1, n_qmc_samples))
i_samples = np.arange(n_qmc_samples) + 1
for j in range(n_batches):
c = np.full(n_qmc_samples, ci)
dc = np.full(n_qmc_samples, dci)
pv = dc.copy()
for i in range(1, n):
# Pseudorandomly-shifted lattice coordinate.
z = q[i - 1] * i_samples + rng.random()
# Fast remainder(z, 1.0)
z -= z.astype(int)
# Tent periodization transform.
x = abs(2 * z - 1)
y[i - 1, :] = phinv(c + x * dc)
s = cho[i, :i] @ y[:i, :]
ct = cho[i, i]
c = phi((lo[i] - s) / ct)
d = phi((hi[i] - s) / ct)
dc = d - c
pv = pv * dc
# Accumulate the mean and error variances with online formulations.
d = (pv.mean() - prob) / (j + 1)
prob += d
error_var = (j - 1) * error_var / (j + 1) + d * d
# Error bounds are 3 times the standard error of the estimates.
est_error = 3 * np.sqrt(error_var)
n_samples = n_qmc_samples * n_batches
return prob, est_error, n_samples
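# Illustrative sketch (not part of the original module): with an identity
# covariance the box probability factorizes, which gives a cheap sanity check
# of `_qmvn` (up to QMC error):
#
#     >>> rng = np.random.default_rng()
#     >>> covar = np.eye(2)
#     >>> low, high = np.full(2, -1.0), np.full(2, 1.0)
#     >>> p, err, n_used = _qmvn(2**13, covar, low, high, rng=rng)
#     >>> expected = (phi(1.0) - phi(-1.0))**2  # approx. 0.466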
# Note: this function is not currently used or tested by any SciPy code. It is
# included in this file to facilitate the resolution of gh-8367, gh-16142, and
# possibly gh-14286, but must be reviewed and tested before use.
def _mvn_qmc_integrand(covar, low, high, use_tent=False):
"""Transform the multivariate normal integration into a QMC integrand over
a unit hypercube.
The dimensionality of the resulting hypercube integration domain is one
less than the dimensionality of the original integrand. Note that this
transformation subsumes the integration bounds in order to account for
infinite bounds. The QMC integration one does with the returned integrand
should be on the unit hypercube.
Parameters
----------
covar : (n, n) float array
Possibly singular, positive semidefinite symmetric covariance matrix.
low, high : (n,) float array
The low and high integration bounds.
use_tent : bool, optional
If True, then use tent periodization. Only helpful for lattice rules.
Returns
-------
integrand : Callable[..., NDArray]
The QMC-integrable integrand. It takes ``ndim_integrand`` separate
1-D arrays of length ``n_qmc_samples`` (the unit-hypercube coordinates
of the QMC samples, one array per dimension) and returns the
``(n_qmc_samples,)`` evaluations at these QMC points.
ndim_integrand : int
The dimensionality of the integrand. Equal to ``n-1``.
"""
cho, lo, hi = _permuted_cholesky(covar, low, high)
n = cho.shape[0]
ndim_integrand = n - 1
ct = cho[0, 0]
c = phi(lo[0] / ct)
d = phi(hi[0] / ct)
ci = c
dci = d - ci
def integrand(*zs):
ndim_qmc = len(zs)
n_qmc_samples = len(np.atleast_1d(zs[0]))
assert ndim_qmc == ndim_integrand
y = np.zeros((ndim_qmc, n_qmc_samples))
c = np.full(n_qmc_samples, ci)
dc = np.full(n_qmc_samples, dci)
pv = dc.copy()
for i in range(1, n):
if use_tent:
# Tent periodization transform.
x = abs(2 * zs[i-1] - 1)
else:
x = zs[i-1]
y[i - 1, :] = phinv(c + x * dc)
s = cho[i, :i] @ y[:i, :]
ct = cho[i, i]
c = phi((lo[i] - s) / ct)
d = phi((hi[i] - s) / ct)
dc = d - c
pv = pv * dc
return pv
return integrand, ndim_integrand
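# Illustrative sketch (not part of the original module): the returned integrand
# is called with one unit-hypercube column per dimension (note the unpacking),
# so it can be fed directly with samples from, e.g., `scipy.stats.qmc.Sobol`:
#
#     >>> from scipy.stats import qmc
#     >>> covar = np.eye(3)
#     >>> low, high = np.full(3, -1.0), np.full(3, 1.0)
#     >>> integrand, ndim = _mvn_qmc_integrand(covar, low, high)
#     >>> u = qmc.Sobol(d=ndim, seed=1).random(2**10)  # shape (1024, ndim)
#     >>> prob_estimate = integrand(*u.T).mean()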
def _qmvt(m, nu, covar, low, high, rng, lattice='cbc', n_batches=10):
"""Multivariate t integration over box bounds.
Parameters
----------
m : int > n_batches
The number of points to sample. This number will be divided into
`n_batches` batches that apply random offsets of the sampling lattice
for each batch in order to estimate the error.
nu : float >= 0
The shape parameter of the multivariate t distribution.
covar : (n, n) float array
Possibly singular, positive semidefinite symmetric covariance matrix.
low, high : (n,) float array
The low and high integration bounds.
rng : Generator
Pseudorandom number generator, e.g. ``np.random.default_rng()``.
lattice : 'cbc' or callable
The type of lattice rule to use to construct the integration points.
n_batches : int > 0, optional
The number of QMC batches to apply.
Returns
-------
prob : float
The estimated probability mass within the bounds.
est_error : float
3 times the standard error of the batch estimates.
n_samples : int
The number of samples actually used.
"""
sn = max(1.0, np.sqrt(nu))
low = np.asarray(low, dtype=np.float64)
high = np.asarray(high, dtype=np.float64)
cho, lo, hi = _permuted_cholesky(covar, low / sn, high / sn)
n = cho.shape[0]
prob = 0.0
error_var = 0.0
q, n_qmc_samples = _cbc_lattice(n, max(m // n_batches, 1))
i_samples = np.arange(n_qmc_samples) + 1
for j in range(n_batches):
pv = np.ones(n_qmc_samples)
s = np.zeros((n, n_qmc_samples))
for i in range(n):
# Pseudorandomly-shifted lattice coordinate.
z = q[i] * i_samples + rng.random()
# Fast remainder(z, 1.0)
z -= z.astype(int)
# Tent periodization transform.
x = abs(2 * z - 1)
# FIXME: Lift the i==0 case out of the loop to make the logic
# easier to follow.
if i == 0:
# We'll use one of the QR variates to pull out the
# t-distribution scaling.
if nu > 0:
r = np.sqrt(2 * gammaincinv(nu / 2, x))
else:
r = np.ones_like(x)
else:
y = phinv(c + x * dc) # noqa: F821
with np.errstate(invalid='ignore'):
s[i:, :] += cho[i:, i - 1][:, np.newaxis] * y
si = s[i, :]
c = np.ones(n_qmc_samples)
d = np.ones(n_qmc_samples)
with np.errstate(invalid='ignore'):
lois = lo[i] * r - si
hiis = hi[i] * r - si
c[lois < -9] = 0.0
d[hiis < -9] = 0.0
lo_mask = abs(lois) < 9
hi_mask = abs(hiis) < 9
c[lo_mask] = phi(lois[lo_mask])
d[hi_mask] = phi(hiis[hi_mask])
dc = d - c
pv *= dc
# Accumulate the mean and error variances with online formulations.
d = (pv.mean() - prob) / (j + 1)
prob += d
error_var = (j - 1) * error_var / (j + 1) + d * d
# Error bounds are 3 times the standard error of the estimates.
est_error = 3 * np.sqrt(error_var)
n_samples = n_qmc_samples * n_batches
return prob, est_error, n_samples
def _permuted_cholesky(covar, low, high, tol=1e-10):
"""Compute a scaled, permuted Cholesky factor, with integration bounds.
The scaling and permuting of the dimensions accomplishes part of the
transformation of the original integration problem into a more numerically
tractable form. The lower-triangular Cholesky factor will then be used in
the subsequent integration. The integration bounds will be scaled and
permuted as well.
Parameters
----------
covar : (n, n) float array
Possibly singular, positive semidefinite symmetric covariance matrix.
low, high : (n,) float array
The low and high integration bounds.
tol : float, optional
The singularity tolerance.
Returns
-------
cho : (n, n) float array
Lower Cholesky factor, scaled and permuted.
new_low, new_high : (n,) float array
The scaled and permuted low and high integration bounds.
"""
# Make copies for outputting.
cho = np.array(covar, dtype=np.float64)
new_lo = np.array(low, dtype=np.float64)
new_hi = np.array(high, dtype=np.float64)
n = cho.shape[0]
if cho.shape != (n, n):
raise ValueError("expected a square symmetric array")
if new_lo.shape != (n,) or new_hi.shape != (n,):
raise ValueError(
"expected integration boundaries the same dimensions "
"as the covariance matrix"
)
# Scale by the sqrt of the diagonal.
dc = np.sqrt(np.maximum(np.diag(cho), 0.0))
# But don't divide by 0.
dc[dc == 0.0] = 1.0
new_lo /= dc
new_hi /= dc
cho /= dc
cho /= dc[:, np.newaxis]
y = np.zeros(n)
sqtp = np.sqrt(2 * np.pi)
for k in range(n):
epk = (k + 1) * tol
im = k
ck = 0.0
dem = 1.0
s = 0.0
lo_m = 0.0
hi_m = 0.0
for i in range(k, n):
if cho[i, i] > tol:
ci = np.sqrt(cho[i, i])
if i > 0:
s = cho[i, :k] @ y[:k]
lo_i = (new_lo[i] - s) / ci
hi_i = (new_hi[i] - s) / ci
de = phi(hi_i) - phi(lo_i)
if de <= dem:
ck = ci
dem = de
lo_m = lo_i
hi_m = hi_i
im = i
if im > k:
# Swap im and k
cho[im, im] = cho[k, k]
_swap_slices(cho, np.s_[im, :k], np.s_[k, :k])
_swap_slices(cho, np.s_[im + 1:, im], np.s_[im + 1:, k])
_swap_slices(cho, np.s_[k + 1:im, k], np.s_[im, k + 1:im])
_swap_slices(new_lo, k, im)
_swap_slices(new_hi, k, im)
if ck > epk:
cho[k, k] = ck
cho[k, k + 1:] = 0.0
for i in range(k + 1, n):
cho[i, k] /= ck
cho[i, k + 1:i + 1] -= cho[i, k] * cho[k + 1:i + 1, k]
if abs(dem) > tol:
y[k] = ((np.exp(-lo_m * lo_m / 2) - np.exp(-hi_m * hi_m / 2)) /
(sqtp * dem))
else:
y[k] = (lo_m + hi_m) / 2
if lo_m < -10:
y[k] = hi_m
elif hi_m > 10:
y[k] = lo_m
cho[k, :k + 1] /= ck
new_lo[k] /= ck
new_hi[k] /= ck
else:
cho[k:, k] = 0.0
y[k] = (new_lo[k] + new_hi[k]) / 2
return cho, new_lo, new_hi
def _swap_slices(x, slc1, slc2):
t = x[slc1].copy()
x[slc1] = x[slc2].copy()
x[slc2] = t

View File

@ -0,0 +1,4 @@
#
from .rcont import rvs_rcont1, rvs_rcont2
__all__ = ["rvs_rcont1", "rvs_rcont2"]

View File

@ -0,0 +1,263 @@
import operator
from dataclasses import dataclass
import numpy as np
from scipy.special import ndtri
from ._common import ConfidenceInterval
def _validate_int(n, bound, name):
msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
try:
n = operator.index(n)
except TypeError:
raise TypeError(msg) from None
if n < bound:
raise ValueError(msg)
return n
@dataclass
class RelativeRiskResult:
"""
Result of `scipy.stats.contingency.relative_risk`.
Attributes
----------
relative_risk : float
This is::
(exposed_cases/exposed_total) / (control_cases/control_total)
exposed_cases : int
The number of "cases" (i.e. occurrence of disease or other event
of interest) among the sample of "exposed" individuals.
exposed_total : int
The total number of "exposed" individuals in the sample.
control_cases : int
The number of "cases" among the sample of "control" or non-exposed
individuals.
control_total : int
The total number of "control" individuals in the sample.
Methods
-------
confidence_interval :
Compute the confidence interval for the relative risk estimate.
"""
relative_risk: float
exposed_cases: int
exposed_total: int
control_cases: int
control_total: int
def confidence_interval(self, confidence_level=0.95):
"""
Compute the confidence interval for the relative risk.
The confidence interval is computed using the Katz method
(i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).
Parameters
----------
confidence_level : float, optional
The confidence level to use for the confidence interval.
Default is 0.95.
Returns
-------
ci : ConfidenceInterval instance
The return value is an object with attributes ``low`` and
``high`` that hold the confidence interval.
References
----------
.. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
confidence intervals for the risk ratio in cohort studies",
Biometrics, 34, 469-474 (1978).
.. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
CRC Press LLC, Boca Raton, FL, USA (1996).
Examples
--------
>>> from scipy.stats.contingency import relative_risk
>>> result = relative_risk(exposed_cases=10, exposed_total=75,
... control_cases=12, control_total=225)
>>> result.relative_risk
2.5
>>> result.confidence_interval()
ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
"""
if not 0 <= confidence_level <= 1:
raise ValueError('confidence_level must be in the interval '
'[0, 1].')
# Handle edge cases where either exposed_cases or control_cases
# is zero. We follow the convention of the R function riskratio
# from the epitools library.
if self.exposed_cases == 0 and self.control_cases == 0:
# relative risk is nan.
return ConfidenceInterval(low=np.nan, high=np.nan)
elif self.exposed_cases == 0:
# relative risk is 0.
return ConfidenceInterval(low=0.0, high=np.nan)
elif self.control_cases == 0:
# relative risk is inf
return ConfidenceInterval(low=np.nan, high=np.inf)
alpha = 1 - confidence_level
z = ndtri(1 - alpha/2)
rr = self.relative_risk
# Estimate of the variance of log(rr) is
# var(log(rr)) = 1/exposed_cases - 1/exposed_total +
# 1/control_cases - 1/control_total
# and the standard error is the square root of that.
se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
1/self.control_cases - 1/self.control_total)
delta = z*se
katz_lo = rr*np.exp(-delta)
katz_hi = rr*np.exp(delta)
return ConfidenceInterval(low=katz_lo, high=katz_hi)
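# Illustrative sketch (not part of the original module): the Katz interval
# computed above can be reproduced by hand for the values used in the
# docstring example (10/75 exposed vs. 12/225 control):
#
#     >>> rr = (10/75) / (12/225)  # 2.5
#     >>> se = np.sqrt(1/10 - 1/75 + 1/12 - 1/225)
#     >>> z = ndtri(1 - 0.05/2)
#     >>> rr * np.exp(-z*se), rr * np.exp(z*se)  # matches confidence_interval()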
def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
"""
Compute the relative risk (also known as the risk ratio).
This function computes the relative risk associated with a 2x2
contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
of accepting a table as an argument, the individual numbers that are
used to compute the relative risk are given as separate parameters.
This is to avoid the ambiguity of which row or column of the contingency
table corresponds to the "exposed" cases and which corresponds to the
"control" cases. Unlike, say, the odds ratio, the relative risk is not
invariant under an interchange of the rows or columns.
Parameters
----------
exposed_cases : nonnegative int
The number of "cases" (i.e. occurrence of disease or other event
of interest) among the sample of "exposed" individuals.
exposed_total : positive int
The total number of "exposed" individuals in the sample.
control_cases : nonnegative int
The number of "cases" among the sample of "control" or non-exposed
individuals.
control_total : positive int
The total number of "control" individuals in the sample.
Returns
-------
result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
The object has the float attribute ``relative_risk``, which is::
rr = (exposed_cases/exposed_total) / (control_cases/control_total)
The object also has the method ``confidence_interval`` to compute
the confidence interval of the relative risk for a given confidence
level.
See Also
--------
odds_ratio
Notes
-----
The R package epitools has the function `riskratio`, which accepts
a table with the following layout::
disease=0 disease=1
exposed=0 (ref) n00 n01
exposed=1 n10 n11
With a 2x2 table in the above format, the estimate of the CI is
computed by `riskratio` when the argument method="wald" is given,
or with the function `riskratio.wald`.
For example, in a test of the incidence of lung cancer among a
sample of smokers and nonsmokers, the "exposed" category would
correspond to "is a smoker" and the "disease" category would
correspond to "has or had lung cancer".
To pass the same data to ``relative_risk``, use::
relative_risk(n11, n10 + n11, n01, n00 + n01)
.. versionadded:: 1.7.0
References
----------
.. [1] Alan Agresti, An Introduction to Categorical Data Analysis
(second edition), Wiley, Hoboken, NJ, USA (2007).
.. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
CRC Press LLC, Boca Raton, FL, USA (1996).
Examples
--------
>>> from scipy.stats.contingency import relative_risk
This example is from Example 3.1 of [2]_. The results of a heart
disease study are summarized in the following table::
High CAT Low CAT Total
-------- ------- -----
CHD 27 44 71
No CHD 95 443 538
Total 122 487 609
CHD is coronary heart disease, and CAT refers to the level of
circulating catecholamine. CAT is the "exposure" variable, and
high CAT is the "exposed" category. So the data from the table
to be passed to ``relative_risk`` is::
exposed_cases = 27
exposed_total = 122
control_cases = 44
control_total = 487
>>> result = relative_risk(27, 122, 44, 487)
>>> result.relative_risk
2.4495156482861398
Find the confidence interval for the relative risk.
>>> result.confidence_interval(confidence_level=0.95)
ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
The interval does not contain 1, so the data supports the statement
that high CAT is associated with greater risk of CHD.
"""
# Relative risk is a trivial calculation. The nontrivial part is in the
# `confidence_interval` method of the RelativeRiskResult class.
exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
exposed_total = _validate_int(exposed_total, 1, "exposed_total")
control_cases = _validate_int(control_cases, 0, "control_cases")
control_total = _validate_int(control_total, 1, "control_total")
if exposed_cases > exposed_total:
raise ValueError('exposed_cases must not exceed exposed_total.')
if control_cases > control_total:
raise ValueError('control_cases must not exceed control_total.')
if exposed_cases == 0 and control_cases == 0:
# relative risk is 0/0.
rr = np.nan
elif exposed_cases == 0:
# relative risk is 0/nonzero
rr = 0.0
elif control_cases == 0:
# relative risk is nonzero/0.
rr = np.inf
else:
p1 = exposed_cases / exposed_total
p2 = control_cases / control_total
rr = p1 / p2
return RelativeRiskResult(relative_risk=rr,
exposed_cases=exposed_cases,
exposed_total=exposed_total,
control_cases=control_cases,
control_total=control_total)

File diff suppressed because it is too large

View File

@ -0,0 +1,40 @@
# This module exists only to allow Sphinx to generate docs
# for the result objects returned by some functions in stats
# _without_ adding them to the main stats documentation page.
"""
Result classes
--------------
.. currentmodule:: scipy.stats._result_classes
.. autosummary::
:toctree: generated/
RelativeRiskResult
BinomTestResult
TukeyHSDResult
DunnettResult
PearsonRResult
FitResult
OddsRatioResult
TtestResult
ECDFResult
EmpiricalDistributionFunction
"""
__all__ = ['BinomTestResult', 'RelativeRiskResult', 'TukeyHSDResult',
'PearsonRResult', 'FitResult', 'OddsRatioResult',
'TtestResult', 'DunnettResult', 'ECDFResult',
'EmpiricalDistributionFunction']
from ._binomtest import BinomTestResult
from ._odds_ratio import OddsRatioResult
from ._relative_risk import RelativeRiskResult
from ._hypotests import TukeyHSDResult
from ._multicomp import DunnettResult
from ._stats_py import PearsonRResult, TtestResult
from ._fit import FitResult
from ._survival import ECDFResult, EmpiricalDistributionFunction

View File

@ -0,0 +1,56 @@
import warnings
from scipy.stats.sampling import RatioUniforms
def rvs_ratio_uniforms(pdf, umax, vmin, vmax, size=1, c=0, random_state=None):
"""
Generate random samples from a probability density function using the
ratio-of-uniforms method.
.. deprecated:: 1.12.0
`rvs_ratio_uniforms` is deprecated in favour of
`scipy.stats.sampling.RatioUniforms` from version 1.12.0 and will
be removed in SciPy 1.15.0
Parameters
----------
pdf : callable
A function with signature `pdf(x)` that is proportional to the
probability density function of the distribution.
umax : float
The upper bound of the bounding rectangle in the u-direction.
vmin : float
The lower bound of the bounding rectangle in the v-direction.
vmax : float
The upper bound of the bounding rectangle in the v-direction.
size : int or tuple of ints, optional
Defining number of random variates (default is 1).
c : float, optional.
Shift parameter of ratio-of-uniforms method, see Notes. Default is 0.
random_state : {None, int, `numpy.random.Generator`,
`numpy.random.RandomState`}, optional
If `random_state` is None (or `np.random`), the `numpy.random.RandomState`
singleton is used.
If `random_state` is an int, a new ``RandomState`` instance is used,
seeded with `random_state`.
If `random_state` is already a ``Generator`` or ``RandomState`` instance
then that instance is used.
Returns
-------
rvs : ndarray
The random variates distributed according to the probability
distribution defined by the pdf.
Notes
-----
Please refer to `scipy.stats.sampling.RatioUniforms` for the documentation.
"""
warnings.warn("Please use `RatioUniforms` from the "
"`scipy.stats.sampling` namespace. The "
"`scipy.stats.rvs_ratio_uniforms` namespace is deprecated "
"and will be removed in SciPy 1.15.0",
category=DeprecationWarning, stacklevel=2)
gen = RatioUniforms(pdf, umax=umax, vmin=vmin, vmax=vmax,
c=c, random_state=random_state)
return gen.rvs(size)
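# Illustrative sketch (not part of the original module): migrating from the
# deprecated wrapper above to `RatioUniforms` directly, here for a standard
# normal pdf (the bounding-rectangle values are the usual ones for
# ``exp(-x**2 / 2)``, i.e. ``umax = 1`` and ``vmax = sqrt(2/e)``):
#
#     >>> import numpy as np
#     >>> from scipy.stats.sampling import RatioUniforms
#     >>> pdf = lambda x: np.exp(-x**2 / 2)
#     >>> umax, vmax = 1.0, np.sqrt(2 / np.e)
#     >>> gen = RatioUniforms(pdf, umax=umax, vmin=-vmax, vmax=vmax,
#     ...                     random_state=42)
#     >>> sample = gen.rvs(size=1000)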

File diff suppressed because it is too large

View File

@ -0,0 +1,712 @@
from __future__ import annotations
import inspect
from dataclasses import dataclass
from typing import (
Callable, Literal, Protocol, TYPE_CHECKING
)
import numpy as np
from scipy.stats._common import ConfidenceInterval
from scipy.stats._qmc import check_random_state
from scipy.stats._resampling import BootstrapResult
from scipy.stats import qmc, bootstrap
if TYPE_CHECKING:
import numpy.typing as npt
from scipy._lib._util import DecimalNumber, IntNumber, SeedType
__all__ = [
'sobol_indices'
]
def f_ishigami(x: npt.ArrayLike) -> np.ndarray:
r"""Ishigami function.
.. math::
Y(\mathbf{x}) = \sin x_1 + 7 \sin^2 x_2 + 0.1 x_3^4 \sin x_1
with :math:`\mathbf{x} \in [-\pi, \pi]^3`.
Parameters
----------
x : array_like ([x1, x2, x3], n)
Returns
-------
f : array_like (n,)
Function evaluation.
References
----------
.. [1] Ishigami, T. and T. Homma. "An importance quantification technique
in uncertainty analysis for computer models." IEEE,
:doi:`10.1109/ISUMA.1990.151285`, 1990.
"""
x = np.atleast_2d(x)
f_eval = (
np.sin(x[0])
+ 7 * np.sin(x[1])**2
+ 0.1 * (x[2]**4) * np.sin(x[0])
)
return f_eval
def sample_A_B(
n: IntNumber,
dists: list[PPFDist],
random_state: SeedType = None
) -> np.ndarray:
"""Sample two matrices A and B.
Uses a Sobol' sequence with ``2*d`` columns to obtain 2 uncorrelated matrices.
This is more efficient than using 2 separate draws of a Sobol' sequence.
See sec. 5 from [1]_.
Output shape is (d, n).
References
----------
.. [1] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
S. Tarantola. "Variance based sensitivity analysis of model
output. Design and estimator for the total sensitivity index."
Computer Physics Communications, 181(2):259-270,
:doi:`10.1016/j.cpc.2009.09.018`, 2010.
"""
d = len(dists)
A_B = qmc.Sobol(d=2*d, seed=random_state, bits=64).random(n).T
A_B = A_B.reshape(2, d, -1)
try:
for d_, dist in enumerate(dists):
A_B[:, d_] = dist.ppf(A_B[:, d_])
except AttributeError as exc:
message = "Each distribution in `dists` must have method `ppf`."
raise ValueError(message) from exc
return A_B
def sample_AB(A: np.ndarray, B: np.ndarray) -> np.ndarray:
"""AB matrix.
AB: rows of B into A. Shape (d, d, n).
- Copy A into d "pages"
- In the first page, replace the 1st row of A with the 1st row of B.
...
- In the dth page, replace dth row of A with dth row of B.
- return the stack of pages
"""
d, n = A.shape
AB = np.tile(A, (d, 1, 1))
i = np.arange(d)
AB[i, i] = B[i]
return AB
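# Illustrative sketch (not part of the original module): a tiny numerical
# example of the page construction for ``d = 2``, ``n = 3``. Page ``i`` is a
# copy of ``A`` with its i-th row replaced by the i-th row of ``B``:
#
#     >>> A = np.array([[1., 2., 3.], [4., 5., 6.]])
#     >>> B = np.array([[7., 8., 9.], [10., 11., 12.]])
#     >>> sample_AB(A, B)[0]  # first page: row 0 taken from B
#     array([[7., 8., 9.],
#            [4., 5., 6.]])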
def saltelli_2010(
f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
r"""Saltelli2010 formulation.
.. math::
S_i = \frac{1}{N} \sum_{j=1}^N
f(\mathbf{B})_j (f(\mathbf{AB}^{(i)})_j - f(\mathbf{A})_j)
.. math::
S_{T_i} = \frac{1}{2N} \sum_{j=1}^N
(f(\mathbf{A})_j - f(\mathbf{AB}^{(i)})_j)^2
Parameters
----------
f_A, f_B : array_like (s, n)
Function values at A and B, respectively
f_AB : array_like (d, s, n)
Function values at each of the AB pages
Returns
-------
s, st : array_like (s, d)
First order and total order Sobol' indices.
References
----------
.. [1] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
S. Tarantola. "Variance based sensitivity analysis of model
output. Design and estimator for the total sensitivity index."
Computer Physics Communications, 181(2):259-270,
:doi:`10.1016/j.cpc.2009.09.018`, 2010.
"""
# Empirical variance calculated using output from A and B which are
# independent. Output of AB is not independent and cannot be used
var = np.var([f_A, f_B], axis=(0, -1))
# We divide by the variance to have a ratio of variance
# this leads to eq. 2
s = np.mean(f_B * (f_AB - f_A), axis=-1) / var # Table 2 (b)
st = 0.5 * np.mean((f_A - f_AB) ** 2, axis=-1) / var # Table 2 (f)
return s.T, st.T
@dataclass
class BootstrapSobolResult:
first_order: BootstrapResult
total_order: BootstrapResult
@dataclass
class SobolResult:
first_order: np.ndarray
total_order: np.ndarray
_indices_method: Callable
_f_A: np.ndarray
_f_B: np.ndarray
_f_AB: np.ndarray
_A: np.ndarray | None = None
_B: np.ndarray | None = None
_AB: np.ndarray | None = None
_bootstrap_result: BootstrapResult | None = None
def bootstrap(
self,
confidence_level: DecimalNumber = 0.95,
n_resamples: IntNumber = 999
) -> BootstrapSobolResult:
"""Bootstrap Sobol' indices to provide confidence intervals.
Parameters
----------
confidence_level : float, default: ``0.95``
The confidence level of the confidence intervals.
n_resamples : int, default: ``999``
The number of resamples performed to form the bootstrap
distribution of the indices.
Returns
-------
res : BootstrapSobolResult
Bootstrap result containing the confidence intervals and the
bootstrap distribution of the indices.
An object with attributes:
first_order : BootstrapResult
Bootstrap result of the first order indices.
total_order : BootstrapResult
Bootstrap result of the total order indices.
See `BootstrapResult` for more details.
"""
def statistic(idx):
f_A_ = self._f_A[:, idx]
f_B_ = self._f_B[:, idx]
f_AB_ = self._f_AB[..., idx]
return self._indices_method(f_A_, f_B_, f_AB_)
n = self._f_A.shape[1]
res = bootstrap(
[np.arange(n)], statistic=statistic, method="BCa",
n_resamples=n_resamples,
confidence_level=confidence_level,
bootstrap_result=self._bootstrap_result
)
self._bootstrap_result = res
first_order = BootstrapResult(
confidence_interval=ConfidenceInterval(
res.confidence_interval.low[0], res.confidence_interval.high[0]
),
bootstrap_distribution=res.bootstrap_distribution[0],
standard_error=res.standard_error[0],
)
total_order = BootstrapResult(
confidence_interval=ConfidenceInterval(
res.confidence_interval.low[1], res.confidence_interval.high[1]
),
bootstrap_distribution=res.bootstrap_distribution[1],
standard_error=res.standard_error[1],
)
return BootstrapSobolResult(
first_order=first_order, total_order=total_order
)
class PPFDist(Protocol):
@property
def ppf(self) -> Callable[..., float]:
...
def sobol_indices(
*,
func: Callable[[np.ndarray], npt.ArrayLike] |
dict[Literal['f_A', 'f_B', 'f_AB'], np.ndarray],
n: IntNumber,
dists: list[PPFDist] | None = None,
method: Callable | Literal['saltelli_2010'] = 'saltelli_2010',
random_state: SeedType = None
) -> SobolResult:
r"""Global sensitivity indices of Sobol'.
Parameters
----------
func : callable or dict(str, array_like)
If `func` is a callable, function to compute the Sobol' indices from.
Its signature must be::
func(x: ArrayLike) -> ArrayLike
with ``x`` of shape ``(d, n)`` and output of shape ``(s, n)`` where:
- ``d`` is the input dimensionality of `func`
(number of input variables),
- ``s`` is the output dimensionality of `func`
(number of output variables), and
- ``n`` is the number of samples (see `n` below).
Function evaluation values must be finite.
If `func` is a dictionary, contains the function evaluations from three
different arrays. Keys must be: ``f_A``, ``f_B`` and ``f_AB``.
``f_A`` and ``f_B`` should have a shape ``(s, n)`` and ``f_AB``
should have a shape ``(d, s, n)``.
This is an advanced feature and misuse can lead to wrong analysis.
n : int
Number of samples used to generate the matrices ``A`` and ``B``.
Must be a power of 2. The total number of points at which `func` is
evaluated will be ``n*(d+2)``.
dists : list(distributions), optional
List of each parameter's distribution. The distribution of parameters
depends on the application and should be carefully chosen.
Parameters are assumed to be independently distributed, meaning there
is no constraint nor relationship between their values.
Distributions must be an instance of a class with a ``ppf``
method.
Must be specified if `func` is a callable, and ignored otherwise.
method : Callable or str, default: 'saltelli_2010'
Method used to compute the first and total Sobol' indices.
If a callable, its signature must be::
func(f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray)
-> Tuple[np.ndarray, np.ndarray]
with ``f_A, f_B`` of shape ``(s, n)`` and ``f_AB`` of shape
``(d, s, n)``.
These arrays contain the function evaluations from three different sets
of samples.
The output is a tuple of the first and total indices with
shape ``(s, d)``.
This is an advanced feature and misuse can lead to wrong analysis.
random_state : {None, int, `numpy.random.Generator`}, optional
If `random_state` is an int or None, a new `numpy.random.Generator` is
created using ``np.random.default_rng(random_state)``.
If `random_state` is already a ``Generator`` instance, then the
provided instance is used.
Returns
-------
res : SobolResult
An object with attributes:
first_order : ndarray of shape (s, d)
First order Sobol' indices.
total_order : ndarray of shape (s, d)
Total order Sobol' indices.
And method:
bootstrap(confidence_level: float, n_resamples: int)
-> BootstrapSobolResult
A method providing confidence intervals on the indices.
See `scipy.stats.bootstrap` for more details.
The bootstrapping is done on both first and total order indices,
and they are available in `BootstrapSobolResult` as attributes
``first_order`` and ``total_order``.
Notes
-----
The Sobol' method [1]_, [2]_ is a variance-based Sensitivity Analysis which
obtains the contribution of each parameter to the variance of the
quantities of interest (QoIs; i.e., the outputs of `func`).
Respective contributions can be used to rank the parameters and
also gauge the complexity of the model by computing the
model's effective (or mean) dimension.
.. note::
Parameters are assumed to be independently distributed. Each
parameter can still follow any distribution. In fact, the distribution
is very important and should match the real distribution of the
parameters.
It uses a functional decomposition of the variance of the function to
explore
.. math::
\mathbb{V}(Y) = \sum_{i}^{d} \mathbb{V}_i (Y) + \sum_{i<j}^{d}
\mathbb{V}_{ij}(Y) + ... + \mathbb{V}_{1,2,...,d}(Y),
introducing conditional variances:
.. math::
\mathbb{V}_i(Y) = \mathbb{\mathbb{V}}[\mathbb{E}(Y|x_i)]
\qquad
\mathbb{V}_{ij}(Y) = \mathbb{\mathbb{V}}[\mathbb{E}(Y|x_i x_j)]
- \mathbb{V}_i(Y) - \mathbb{V}_j(Y),
Sobol' indices are expressed as
.. math::
S_i = \frac{\mathbb{V}_i(Y)}{\mathbb{V}[Y]}
\qquad
S_{ij} =\frac{\mathbb{V}_{ij}(Y)}{\mathbb{V}[Y]}.
:math:`S_{i}` corresponds to the first-order term which apprises the
contribution of the i-th parameter, while :math:`S_{ij}` corresponds to the
second-order term which informs about the contribution of interactions
between the i-th and the j-th parameters. These equations can be
generalized to compute higher order terms; however, they are expensive to
compute and their interpretation is complex.
This is why only first order indices are provided.
Total order indices represent the global contribution of the parameters
to the variance of the QoI and are defined as:
.. math::
S_{T_i} = S_i + \sum_j S_{ij} + \sum_{j,k} S_{ijk} + ...
= 1 - \frac{\mathbb{V}[\mathbb{E}(Y|x_{\sim i})]}{\mathbb{V}[Y]}.
First order indices sum to at most 1, while total order indices sum to at
least 1. If there are no interactions, then first and total order indices
are equal, and both first and total order indices sum to 1.
.. warning::
Negative Sobol' values are due to numerical errors. Increasing the
number of points `n` should help.
The number of samples required for a good analysis increases with
the dimensionality of the problem, e.g. for a 3-dimensional problem,
consider at minimum ``n >= 2**12``. The more complex the model is,
the more samples will be needed.
Even for a purely additive model, the indices may not sum to 1 due
to numerical noise.
References
----------
.. [1] Sobol, I. M.. "Sensitivity analysis for nonlinear mathematical
models." Mathematical Modeling and Computational Experiment, 1:407-414,
1993.
.. [2] Sobol, I. M. (2001). "Global sensitivity indices for nonlinear
mathematical models and their Monte Carlo estimates." Mathematics
and Computers in Simulation, 55(1-3):271-280,
:doi:`10.1016/S0378-4754(00)00270-6`, 2001.
.. [3] Saltelli, A. "Making best use of model evaluations to
compute sensitivity indices." Computer Physics Communications,
145(2):280-297, :doi:`10.1016/S0010-4655(02)00280-1`, 2002.
.. [4] Saltelli, A., M. Ratto, T. Andres, F. Campolongo, J. Cariboni,
D. Gatelli, M. Saisana, and S. Tarantola. "Global Sensitivity Analysis.
The Primer." 2007.
.. [5] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
S. Tarantola. "Variance based sensitivity analysis of model
output. Design and estimator for the total sensitivity index."
Computer Physics Communications, 181(2):259-270,
:doi:`10.1016/j.cpc.2009.09.018`, 2010.
.. [6] Ishigami, T. and T. Homma. "An importance quantification technique
in uncertainty analysis for computer models." IEEE,
:doi:`10.1109/ISUMA.1990.151285`, 1990.
Examples
--------
The following is an example with the Ishigami function [6]_
.. math::
Y(\mathbf{x}) = \sin x_1 + 7 \sin^2 x_2 + 0.1 x_3^4 \sin x_1,
with :math:`\mathbf{x} \in [-\pi, \pi]^3`. This function exhibits strong
non-linearity and non-monotonicity.
Remember, Sobol' indices assume that samples are independently
distributed. In this case we use a uniform distribution on each marginal.
>>> import numpy as np
>>> from scipy.stats import sobol_indices, uniform
>>> rng = np.random.default_rng()
>>> def f_ishigami(x):
... f_eval = (
... np.sin(x[0])
... + 7 * np.sin(x[1])**2
... + 0.1 * (x[2]**4) * np.sin(x[0])
... )
... return f_eval
>>> indices = sobol_indices(
... func=f_ishigami, n=1024,
... dists=[
... uniform(loc=-np.pi, scale=2*np.pi),
... uniform(loc=-np.pi, scale=2*np.pi),
... uniform(loc=-np.pi, scale=2*np.pi)
... ],
... random_state=rng
... )
>>> indices.first_order
array([0.31637954, 0.43781162, 0.00318825])
>>> indices.total_order
array([0.56122127, 0.44287857, 0.24229595])
Confidence interval can be obtained using bootstrapping.
>>> boot = indices.bootstrap()
Then, this information can be easily visualized.
>>> import matplotlib.pyplot as plt
>>> fig, axs = plt.subplots(1, 2, figsize=(9, 4))
>>> _ = axs[0].errorbar(
... [1, 2, 3], indices.first_order, fmt='o',
... yerr=[
... indices.first_order - boot.first_order.confidence_interval.low,
... boot.first_order.confidence_interval.high - indices.first_order
... ],
... )
>>> axs[0].set_ylabel("First order Sobol' indices")
>>> axs[0].set_xlabel('Input parameters')
>>> axs[0].set_xticks([1, 2, 3])
>>> _ = axs[1].errorbar(
... [1, 2, 3], indices.total_order, fmt='o',
... yerr=[
... indices.total_order - boot.total_order.confidence_interval.low,
... boot.total_order.confidence_interval.high - indices.total_order
... ],
... )
>>> axs[1].set_ylabel("Total order Sobol' indices")
>>> axs[1].set_xlabel('Input parameters')
>>> axs[1].set_xticks([1, 2, 3])
>>> plt.tight_layout()
>>> plt.show()
.. note::
By default, `scipy.stats.uniform` has support ``[0, 1]``.
Using the parameters ``loc`` and ``scale``, one obtains the uniform
distribution on ``[loc, loc + scale]``.
This result is particularly interesting because the first order index
:math:`S_{x_3} = 0` whereas its total order is :math:`S_{T_{x_3}} = 0.244`.
This means that higher order interactions with :math:`x_3` are responsible
for the difference. Almost 25% of the observed variance
on the QoI is due to interactions between :math:`x_3` and :math:`x_1`,
although :math:`x_3` by itself has no impact on the QoI.
The following gives a visual explanation of Sobol' indices on this
function. Let's generate 1024 samples in :math:`[-\pi, \pi]^3` and
calculate the value of the output.
>>> from scipy.stats import qmc
>>> n_dim = 3
>>> p_labels = ['$x_1$', '$x_2$', '$x_3$']
>>> sample = qmc.Sobol(d=n_dim, seed=rng).random(1024)
>>> sample = qmc.scale(
... sample=sample,
... l_bounds=[-np.pi, -np.pi, -np.pi],
... u_bounds=[np.pi, np.pi, np.pi]
... )
>>> output = f_ishigami(sample.T)
Now we can do scatter plots of the output with respect to each parameter.
This gives a visual way to understand how each parameter impacts the
output of the function.
>>> fig, ax = plt.subplots(1, n_dim, figsize=(12, 4))
>>> for i in range(n_dim):
... xi = sample[:, i]
... ax[i].scatter(xi, output, marker='+')
... ax[i].set_xlabel(p_labels[i])
>>> ax[0].set_ylabel('Y')
>>> plt.tight_layout()
>>> plt.show()
Now Sobol' goes a step further:
by conditioning the output value by given values of the parameter
(black lines), the conditional output mean is computed. It corresponds to
the term :math:`\mathbb{E}(Y|x_i)`. Taking the variance of this term gives
the numerator of the Sobol' indices.
>>> mini = np.min(output)
>>> maxi = np.max(output)
>>> n_bins = 10
>>> bins = np.linspace(-np.pi, np.pi, num=n_bins, endpoint=False)
>>> dx = bins[1] - bins[0]
>>> fig, ax = plt.subplots(1, n_dim, figsize=(12, 4))
>>> for i in range(n_dim):
... xi = sample[:, i]
... ax[i].scatter(xi, output, marker='+')
... ax[i].set_xlabel(p_labels[i])
... for bin_ in bins:
... idx = np.where((bin_ <= xi) & (xi <= bin_ + dx))
... xi_ = xi[idx]
... y_ = output[idx]
... ave_y_ = np.mean(y_)
... ax[i].plot([bin_ + dx/2] * 2, [mini, maxi], c='k')
... ax[i].scatter(bin_ + dx/2, ave_y_, c='r')
>>> ax[0].set_ylabel('Y')
>>> plt.tight_layout()
>>> plt.show()
Looking at :math:`x_3`, the variance
of the mean is zero leading to :math:`S_{x_3} = 0`. But we can further
observe that the variance of the output is not constant along the parameter
values of :math:`x_3`. This heteroscedasticity is explained by higher order
interactions. Moreover, heteroscedasticity is also noticeable on
:math:`x_1` leading to an interaction between :math:`x_3` and :math:`x_1`.
On :math:`x_2`, the variance seems to be constant and thus null interaction
with this parameter can be supposed.
This case is fairly simple to analyse visually---although it is only a
qualitative analysis. Nevertheless, when the number of input parameters
increases such analysis becomes unrealistic as it would be difficult to
conclude on high-order terms. Hence the benefit of using Sobol' indices.
"""
random_state = check_random_state(random_state)
n_ = int(n)
if not (n_ & (n_ - 1) == 0) or n != n_:
raise ValueError(
"The balance properties of Sobol' points require 'n' "
"to be a power of 2."
)
n = n_
if not callable(method):
indices_methods: dict[str, Callable] = {
"saltelli_2010": saltelli_2010,
}
try:
method = method.lower() # type: ignore[assignment]
indices_method_ = indices_methods[method]
except KeyError as exc:
message = (
f"{method!r} is not a valid 'method'. It must be one of"
f" {set(indices_methods)!r} or a callable."
)
raise ValueError(message) from exc
else:
indices_method_ = method
sig = inspect.signature(indices_method_)
if set(sig.parameters) != {'f_A', 'f_B', 'f_AB'}:
message = (
"If 'method' is a callable, it must have the following"
f" signature: {inspect.signature(saltelli_2010)}"
)
raise ValueError(message)
def indices_method(f_A, f_B, f_AB):
"""Wrap indices method to ensure proper output dimension.
1D when single output, 2D otherwise.
"""
return np.squeeze(indices_method_(f_A=f_A, f_B=f_B, f_AB=f_AB))
if callable(func):
if dists is None:
raise ValueError(
"'dists' must be defined when 'func' is a callable."
)
def wrapped_func(x):
return np.atleast_2d(func(x))
A, B = sample_A_B(n=n, dists=dists, random_state=random_state)
AB = sample_AB(A=A, B=B)
f_A = wrapped_func(A)
if f_A.shape[1] != n:
raise ValueError(
"'func' output should have a shape ``(s, -1)`` with ``s`` "
"the number of output."
)
def funcAB(AB):
d, d, n = AB.shape
AB = np.moveaxis(AB, 0, -1).reshape(d, n*d)
f_AB = wrapped_func(AB)
return np.moveaxis(f_AB.reshape((-1, n, d)), -1, 0)
f_B = wrapped_func(B)
f_AB = funcAB(AB)
else:
message = (
"When 'func' is a dictionary, it must contain the following "
"keys: 'f_A', 'f_B' and 'f_AB'."
"'f_A' and 'f_B' should have a shape ``(s, n)`` and 'f_AB' "
"should have a shape ``(d, s, n)``."
)
try:
f_A, f_B, f_AB = np.atleast_2d(
func['f_A'], func['f_B'], func['f_AB']
)
except KeyError as exc:
raise ValueError(message) from exc
if f_A.shape[1] != n or f_A.shape != f_B.shape or \
f_AB.shape == f_A.shape or f_AB.shape[-1] % n != 0:
raise ValueError(message)
# Normalization by mean
# Sobol', I. and Levitan, Y. L. (1999). On the use of variance reducing
# multipliers in monte carlo computations of a global sensitivity index.
# Computer Physics Communications, 117(1) :52-61.
mean = np.mean([f_A, f_B], axis=(0, -1)).reshape(-1, 1)
f_A -= mean
f_B -= mean
f_AB -= mean
# Compute indices
# Filter warnings for constant output as var = 0
with np.errstate(divide='ignore', invalid='ignore'):
first_order, total_order = indices_method(f_A=f_A, f_B=f_B, f_AB=f_AB)
# null variance means null indices
first_order[~np.isfinite(first_order)] = 0
total_order[~np.isfinite(total_order)] = 0
res = dict(
first_order=first_order,
total_order=total_order,
_indices_method=indices_method,
_f_A=f_A,
_f_B=f_B,
_f_AB=f_AB
)
if callable(func):
res.update(
dict(
_A=A,
_B=B,
_AB=AB,
)
)
return SobolResult(**res)
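# Illustrative sketch (not part of the original module): `sobol_indices` also
# accepts pre-computed evaluations through the dictionary form of `func`. The
# arrays below are built with the module's own helpers purely for illustration:
#
#     >>> from scipy import stats
#     >>> rng = np.random.default_rng()
#     >>> n = 2**10
#     >>> dists = [stats.uniform(loc=-np.pi, scale=2*np.pi)] * 3
#     >>> A, B = sample_A_B(n=n, dists=dists, random_state=rng)
#     >>> f_A = f_ishigami(A)[np.newaxis]  # shape (1, n)
#     >>> f_B = f_ishigami(B)[np.newaxis]
#     >>> f_AB = np.stack([f_ishigami(page)[np.newaxis]
#     ...                  for page in sample_AB(A, B)])  # shape (3, 1, n)
#     >>> res = sobol_indices(func=dict(f_A=f_A, f_B=f_B, f_AB=f_AB), n=n)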

View File

@ -0,0 +1,54 @@
import numpy as np
from scipy._lib._util import IntNumber
from typing import Literal
def _initialize_v(
v : np.ndarray,
dim : IntNumber,
bits: IntNumber
) -> None: ...
def _cscramble (
dim : IntNumber,
bits: IntNumber,
ltm : np.ndarray,
sv: np.ndarray
) -> None: ...
def _fill_p_cumulative(
p: np.ndarray,
p_cumulative: np.ndarray
) -> None: ...
def _draw(
n : IntNumber,
num_gen: IntNumber,
dim: IntNumber,
scale: float,
sv: np.ndarray,
quasi: np.ndarray,
sample: np.ndarray
) -> None: ...
def _fast_forward(
n: IntNumber,
num_gen: IntNumber,
dim: IntNumber,
sv: np.ndarray,
quasi: np.ndarray
) -> None: ...
def _categorize(
draws: np.ndarray,
p_cumulative: np.ndarray,
result: np.ndarray
) -> None: ...
_MAXDIM: Literal[21201]
_MAXDEG: Literal[18]
def _test_find_index(
p_cumulative: np.ndarray,
size: int,
value: float
) -> int: ...

View File

@ -0,0 +1,10 @@
# destined to be used in a LowLevelCallable
cdef double _geninvgauss_pdf(double x, void *user_data) noexcept nogil
cdef double _studentized_range_cdf(int n, double[2] x, void *user_data) noexcept nogil
cdef double _studentized_range_cdf_asymptotic(double z, void *user_data) noexcept nogil
cdef double _studentized_range_pdf(int n, double[2] x, void *user_data) noexcept nogil
cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) noexcept nogil
cdef double _studentized_range_moment(int n, double[3] x_arg, void *user_data) noexcept nogil
cdef double _genhyperbolic_pdf(double x, void *user_data) noexcept nogil
cdef double _genhyperbolic_logpdf(double x, void *user_data) noexcept nogil

View File

@ -0,0 +1,303 @@
import warnings
import numpy as np
from . import distributions
from .._lib._bunch import _make_tuple_bunch
from ._stats_pythran import siegelslopes as siegelslopes_pythran
__all__ = ['_find_repeats', 'theilslopes', 'siegelslopes']
# This is not a namedtuple for backwards compatibility. See PR #12983
TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
['slope', 'intercept',
'low_slope', 'high_slope'])
SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
['slope', 'intercept'])
def theilslopes(y, x=None, alpha=0.95, method='separate'):
r"""
Computes the Theil-Sen estimator for a set of points (x, y).
`theilslopes` implements a method for robust linear regression. It
computes the slope as the median of all slopes between paired values.
Parameters
----------
y : array_like
Dependent variable.
x : array_like or None, optional
Independent variable. If None, use ``arange(len(y))`` instead.
alpha : float, optional
Confidence degree between 0 and 1. Default is 95% confidence.
Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
interpreted as "find the 90% confidence interval".
method : {'joint', 'separate'}, optional
Method to be used for computing estimate for intercept.
Following methods are supported,
* 'joint': Uses np.median(y - slope * x) as intercept.
* 'separate': Uses np.median(y) - slope * np.median(x)
as intercept.
The default is 'separate'.
.. versionadded:: 1.8.0
Returns
-------
result : ``TheilslopesResult`` instance
The return value is an object with the following attributes:
slope : float
Theil slope.
intercept : float
Intercept of the Theil line.
low_slope : float
Lower bound of the confidence interval on `slope`.
high_slope : float
Upper bound of the confidence interval on `slope`.
See Also
--------
siegelslopes : a similar technique using repeated medians
Notes
-----
The implementation of `theilslopes` follows [1]_. The intercept is
not defined in [1]_, and here it is defined as ``median(y) -
slope*median(x)``, which is given in [3]_. Other definitions of
the intercept exist in the literature such as ``median(y - slope*x)``
in [4]_. The approach to compute the intercept can be determined by the
parameter ``method``. A confidence interval for the intercept is not
given as this question is not addressed in [1]_.
For compatibility with older versions of SciPy, the return value acts
like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
``low_slope``, and ``high_slope``, so one can continue to write::
slope, intercept, low_slope, high_slope = theilslopes(y, x)
References
----------
.. [1] P.K. Sen, "Estimates of the regression coefficient based on
Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
.. [2] H. Theil, "A rank-invariant method of linear and polynomial
regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
.. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
John Wiley and Sons, New York, pp. 493.
.. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(-5, 5, num=150)
>>> y = x + np.random.normal(size=x.size)
>>> y[11:15] += 10 # add outliers
>>> y[-5:] -= 7
Compute the slope, intercept and 90% confidence interval. For comparison,
also compute the least-squares fit with `linregress`:
>>> res = stats.theilslopes(y, x, 0.90, method='separate')
>>> lsq_res = stats.linregress(x, y)
Plot the results. The Theil-Sen regression line is shown in red, with the
dashed red lines illustrating the confidence interval of the slope (note
that the dashed red lines are not the confidence interval of the regression
as the confidence interval of the intercept is not included). The green
line shows the least-squares fit for comparison.
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(x, y, 'b.')
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
>>> ax.plot(x, res[1] + res[2] * x, 'r--')
>>> ax.plot(x, res[1] + res[3] * x, 'r--')
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
>>> plt.show()
"""
if method not in ['joint', 'separate']:
raise ValueError("method must be either 'joint' or 'separate'."
f"'{method}' is invalid.")
# We copy both x and y so we can use _find_repeats.
y = np.array(y, dtype=float, copy=True).ravel()
if x is None:
x = np.arange(len(y), dtype=float)
else:
x = np.array(x, dtype=float, copy=True).ravel()
if len(x) != len(y):
raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
# Compute sorted slopes only when deltax > 0
deltax = x[:, np.newaxis] - x
deltay = y[:, np.newaxis] - y
slopes = deltay[deltax > 0] / deltax[deltax > 0]
if not slopes.size:
msg = "All `x` coordinates are identical."
warnings.warn(msg, RuntimeWarning, stacklevel=2)
slopes.sort()
medslope = np.median(slopes)
if method == 'joint':
medinter = np.median(y - medslope * x)
else:
medinter = np.median(y) - medslope * np.median(x)
# Now compute confidence intervals
if alpha > 0.5:
alpha = 1. - alpha
z = distributions.norm.ppf(alpha / 2.)
# This implements (2.6) from Sen (1968)
_, nxreps = _find_repeats(x)
_, nyreps = _find_repeats(y)
nt = len(slopes) # N in Sen (1968)
ny = len(y) # n in Sen (1968)
# Equation 2.6 in Sen (1968):
sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
sum(k * (k-1) * (2*k + 5) for k in nxreps) -
sum(k * (k-1) * (2*k + 5) for k in nyreps))
# Find the confidence interval indices in `slopes`
try:
sigma = np.sqrt(sigsq)
Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
delta = slopes[[Rl, Ru]]
except (ValueError, IndexError):
delta = (np.nan, np.nan)
return TheilslopesResult(slope=medslope, intercept=medinter,
low_slope=delta[0], high_slope=delta[1])
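# Illustrative sketch (not part of the original module): the slope returned by
# `theilslopes` is the median of all pairwise slopes, which can be checked
# directly on a tiny data set:
#
#     >>> x = np.array([0., 1., 2., 3., 4.])
#     >>> y = np.array([0., 2., 1., 6., 4.])
#     >>> res = theilslopes(y, x)
#     >>> pairwise = [(y[j] - y[i]) / (x[j] - x[i])
#     ...             for i in range(len(x)) for j in range(i + 1, len(x))]
#     >>> bool(np.isclose(res.slope, np.median(pairwise)))
#     True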
def _find_repeats(arr):
# This function assumes it may clobber its input.
if len(arr) == 0:
return np.array(0, np.float64), np.array(0, np.intp)
# XXX This cast was previously needed for the Fortran implementation,
# should we ditch it?
arr = np.asarray(arr, np.float64).ravel()
arr.sort()
# Taken from NumPy 1.9's np.unique.
change = np.concatenate(([True], arr[1:] != arr[:-1]))
unique = arr[change]
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
freq = np.diff(change_idx)
atleast2 = freq > 1
return unique[atleast2], freq[atleast2]
def siegelslopes(y, x=None, method="hierarchical"):
r"""
Computes the Siegel estimator for a set of points (x, y).
`siegelslopes` implements a method for robust linear regression
using repeated medians (see [1]_) to fit a line to the points (x, y).
The method is robust to outliers with an asymptotic breakdown point
of 50%.
Parameters
----------
y : array_like
Dependent variable.
x : array_like or None, optional
Independent variable. If None, use ``arange(len(y))`` instead.
method : {'hierarchical', 'separate'}
If 'hierarchical', estimate the intercept using the estimated
slope ``slope`` (default option).
If 'separate', estimate the intercept independent of the estimated
slope. See Notes for details.
Returns
-------
result : ``SiegelslopesResult`` instance
The return value is an object with the following attributes:
slope : float
Estimate of the slope of the regression line.
intercept : float
Estimate of the intercept of the regression line.
See Also
--------
theilslopes : a similar technique without repeated medians
Notes
-----
With ``n = len(y)``, compute ``m_j`` as the median of
the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
``slope`` is then the median of all slopes ``m_j``.
Two ways are given to estimate the intercept in [1]_ which can be chosen
via the parameter ``method``.
The hierarchical approach uses the estimated slope ``slope``
and computes ``intercept`` as the median of ``y - slope*x``.
The other approach estimates the intercept separately as follows: for
each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
lines through the remaining points and take the median ``i_j``.
``intercept`` is the median of the ``i_j``.
The implementation computes `n` times the median of a vector of size `n`
which can be slow for large vectors. There are more efficient algorithms
(see [2]_) which are not implemented here.
For compatibility with older versions of SciPy, the return value acts
like a ``namedtuple`` of length 2, with fields ``slope`` and
``intercept``, so one can continue to write::
slope, intercept = siegelslopes(y, x)
References
----------
.. [1] A. Siegel, "Robust Regression Using Repeated Medians",
Biometrika, Vol. 69, pp. 242-244, 1982.
.. [2] A. Stein and M. Werman, "Finding the repeated median regression
line", Proceedings of the Third Annual ACM-SIAM Symposium on
Discrete Algorithms, pp. 409-413, 1992.
Examples
--------
>>> import numpy as np
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>> x = np.linspace(-5, 5, num=150)
>>> y = x + np.random.normal(size=x.size)
>>> y[11:15] += 10 # add outliers
>>> y[-5:] -= 7
Compute the slope and intercept. For comparison, also compute the
least-squares fit with `linregress`:
>>> res = stats.siegelslopes(y, x)
>>> lsq_res = stats.linregress(x, y)
Plot the results. The Siegel regression line is shown in red. The green
line shows the least-squares fit for comparison.
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.plot(x, y, 'b.')
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
>>> plt.show()
"""
if method not in ['hierarchical', 'separate']:
raise ValueError("method can only be 'hierarchical' or 'separate'")
y = np.asarray(y).ravel()
if x is None:
x = np.arange(len(y), dtype=float)
else:
x = np.asarray(x, dtype=float).ravel()
if len(x) != len(y):
raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
dtype = np.result_type(x, y, np.float32) # use at least float32
y, x = y.astype(dtype), x.astype(dtype)
medslope, medinter = siegelslopes_pythran(y, x, method)
return SiegelslopesResult(slope=medslope, intercept=medinter)
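# Illustrative sketch (not part of the original module): the repeated-median
# slope described in the Notes of `siegelslopes` can be reproduced with a
# direct (slow) double loop:
#
#     >>> x = np.array([0., 1., 2., 3., 4.])
#     >>> y = np.array([0., 2., 1., 6., 4.])
#     >>> res = siegelslopes(y, x)
#     >>> m = [np.median([(y[k] - y[j]) / (x[k] - x[j])
#     ...                 for k in range(len(x)) if k != j])
#     ...      for j in range(len(x))]
#     >>> bool(np.isclose(res.slope, np.median(m)))
#     True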

File diff suppressed because it is too large

View File

@ -0,0 +1,686 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import TYPE_CHECKING
import warnings
import numpy as np
from scipy import special, interpolate, stats
from scipy.stats._censored_data import CensoredData
from scipy.stats._common import ConfidenceInterval
if TYPE_CHECKING:
from typing import Literal
import numpy.typing as npt
__all__ = ['ecdf', 'logrank']
@dataclass
class EmpiricalDistributionFunction:
"""An empirical distribution function produced by `scipy.stats.ecdf`
Attributes
----------
quantiles : ndarray
The unique values of the sample from which the
`EmpiricalDistributionFunction` was estimated.
probabilities : ndarray
The point estimates of the cumulative distribution function (CDF) or
its complement, the survival function (SF), corresponding with
`quantiles`.
"""
quantiles: np.ndarray
probabilities: np.ndarray
# Exclude these from __str__
_n: np.ndarray = field(repr=False) # number "at risk"
_d: np.ndarray = field(repr=False) # number of "deaths"
_sf: np.ndarray = field(repr=False) # survival function for var estimate
_kind: str = field(repr=False) # type of function: "cdf" or "sf"
def __init__(self, q, p, n, d, kind):
self.probabilities = p
self.quantiles = q
self._n = n
self._d = d
self._sf = p if kind == 'sf' else 1 - p
self._kind = kind
f0 = 1 if kind == 'sf' else 0 # leftmost function value
f1 = 1 - f0
# fill_value can't handle edge cases at infinity
x = np.insert(q, [0, len(q)], [-np.inf, np.inf])
y = np.insert(p, [0, len(p)], [f0, f1])
# `or` conditions handle the case of empty x, points
self._f = interpolate.interp1d(x, y, kind='previous',
assume_sorted=True)
def evaluate(self, x):
"""Evaluate the empirical CDF/SF function at the input.
Parameters
----------
x : ndarray
Argument to the CDF/SF
Returns
-------
y : ndarray
The CDF/SF evaluated at the input
"""
return self._f(x)
def plot(self, ax=None, **matplotlib_kwargs):
"""Plot the empirical distribution function
Available only if ``matplotlib`` is installed.
Parameters
----------
ax : matplotlib.axes.Axes
Axes object to draw the plot onto, otherwise uses the current Axes.
**matplotlib_kwargs : dict, optional
Keyword arguments passed directly to `matplotlib.axes.Axes.step`.
Unless overridden, ``where='post'``.
Returns
-------
lines : list of `matplotlib.lines.Line2D`
Objects representing the plotted data
"""
try:
import matplotlib # noqa: F401
except ModuleNotFoundError as exc:
message = "matplotlib must be installed to use method `plot`."
raise ModuleNotFoundError(message) from exc
if ax is None:
import matplotlib.pyplot as plt
ax = plt.gca()
kwargs = {'where': 'post'}
kwargs.update(matplotlib_kwargs)
delta = np.ptp(self.quantiles)*0.05 # how far past sample edge to plot
q = self.quantiles
q = [q[0] - delta] + list(q) + [q[-1] + delta]
return ax.step(q, self.evaluate(q), **kwargs)
def confidence_interval(self, confidence_level=0.95, *, method='linear'):
"""Compute a confidence interval around the CDF/SF point estimate
Parameters
----------
confidence_level : float, default: 0.95
Confidence level for the computed confidence interval
method : str, {"linear", "log-log"}
Method used to compute the confidence interval. Options are
"linear" for the conventional Greenwood confidence interval
(default) and "log-log" for the "exponential Greenwood",
log-negative-log-transformed confidence interval.
Returns
-------
ci : ``ConfidenceInterval``
An object with attributes ``low`` and ``high``, instances of
`~scipy.stats._result_classes.EmpiricalDistributionFunction` that
represent the lower and upper bounds (respectively) of the
confidence interval.
Notes
-----
Confidence intervals are computed according to the Greenwood formula
(``method='linear'``) or the more recent "exponential Greenwood"
formula (``method='log-log'``) as described in [1]_. The conventional
Greenwood formula can result in lower confidence limits less than 0
and upper confidence limits greater than 1; these are clipped to the
unit interval. NaNs may be produced by either method; these are
features of the formulas.
References
----------
.. [1] Sawyer, Stanley. "The Greenwood and Exponential Greenwood
Confidence Intervals in Survival Analysis."
https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf
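        Examples
        --------
        A minimal usage sketch (the data below are purely illustrative):

        >>> from scipy import stats
        >>> sample = stats.CensoredData(uncensored=[2., 4., 7.], right=[8., 9.])
        >>> res = stats.ecdf(sample)
        >>> ci = res.sf.confidence_interval(confidence_level=0.9)
        >>> low, high = ci.low.probabilities, ci.high.probabilities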
"""
message = ("Confidence interval bounds do not implement a "
"`confidence_interval` method.")
if self._n is None:
raise NotImplementedError(message)
methods = {'linear': self._linear_ci,
'log-log': self._loglog_ci}
message = f"`method` must be one of {set(methods)}."
if method.lower() not in methods:
raise ValueError(message)
message = "`confidence_level` must be a scalar between 0 and 1."
confidence_level = np.asarray(confidence_level)[()]
if confidence_level.shape or not (0 <= confidence_level <= 1):
raise ValueError(message)
method_fun = methods[method.lower()]
low, high = method_fun(confidence_level)
message = ("The confidence interval is undefined at some observations."
" This is a feature of the mathematical formula used, not"
" an error in its implementation.")
if np.any(np.isnan(low) | np.isnan(high)):
warnings.warn(message, RuntimeWarning, stacklevel=2)
low, high = np.clip(low, 0, 1), np.clip(high, 0, 1)
low = EmpiricalDistributionFunction(self.quantiles, low, None, None,
self._kind)
high = EmpiricalDistributionFunction(self.quantiles, high, None, None,
self._kind)
return ConfidenceInterval(low, high)
def _linear_ci(self, confidence_level):
sf, d, n = self._sf, self._d, self._n
# When n == d, Greenwood's formula divides by zero.
        # When sf != 0, this can be ignored: var == inf, and CI is [0, 1]
        # When sf == 0, this results in NaNs. Produce an informative warning.
with np.errstate(divide='ignore', invalid='ignore'):
var = sf ** 2 * np.cumsum(d / (n * (n - d)))
se = np.sqrt(var)
z = special.ndtri(1 / 2 + confidence_level / 2)
z_se = z * se
low = self.probabilities - z_se
high = self.probabilities + z_se
return low, high
def _loglog_ci(self, confidence_level):
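        # "Exponential Greenwood" interval: the CI is formed on the
        # log(-log(sf)) scale and transformed back, which keeps the bounds
        # within [0, 1]; see the Sawyer reference in `confidence_interval`.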
sf, d, n = self._sf, self._d, self._n
with np.errstate(divide='ignore', invalid='ignore'):
var = 1 / np.log(sf) ** 2 * np.cumsum(d / (n * (n - d)))
se = np.sqrt(var)
z = special.ndtri(1 / 2 + confidence_level / 2)
with np.errstate(divide='ignore'):
lnl_points = np.log(-np.log(sf))
z_se = z * se
low = np.exp(-np.exp(lnl_points + z_se))
high = np.exp(-np.exp(lnl_points - z_se))
if self._kind == "cdf":
low, high = 1-high, 1-low
return low, high
@dataclass
class ECDFResult:
""" Result object returned by `scipy.stats.ecdf`
Attributes
----------
cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
An object representing the empirical cumulative distribution function.
sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
An object representing the complement of the empirical cumulative
distribution function.
"""
cdf: EmpiricalDistributionFunction
sf: EmpiricalDistributionFunction
def __init__(self, q, cdf, sf, n, d):
self.cdf = EmpiricalDistributionFunction(q, cdf, n, d, "cdf")
self.sf = EmpiricalDistributionFunction(q, sf, n, d, "sf")
def _iv_CensoredData(
sample: npt.ArrayLike | CensoredData, param_name: str = 'sample'
) -> CensoredData:
"""Attempt to convert `sample` to `CensoredData`."""
if not isinstance(sample, CensoredData):
try: # takes care of input standardization/validation
sample = CensoredData(uncensored=sample)
except ValueError as e:
message = str(e).replace('uncensored', param_name)
raise type(e)(message) from e
return sample
def ecdf(sample: npt.ArrayLike | CensoredData) -> ECDFResult:
"""Empirical cumulative distribution function of a sample.
The empirical cumulative distribution function (ECDF) is a step function
estimate of the CDF of the distribution underlying a sample. This function
returns objects representing both the empirical distribution function and
its complement, the empirical survival function.
Parameters
----------
sample : 1D array_like or `scipy.stats.CensoredData`
Besides array_like, instances of `scipy.stats.CensoredData` containing
uncensored and right-censored observations are supported. Currently,
other instances of `scipy.stats.CensoredData` will result in a
``NotImplementedError``.
Returns
-------
res : `~scipy.stats._result_classes.ECDFResult`
An object with the following attributes.
cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
An object representing the empirical cumulative distribution
function.
sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
An object representing the empirical survival function.
The `cdf` and `sf` attributes themselves have the following attributes.
quantiles : ndarray
        The unique values in the sample that define the empirical CDF/SF.
probabilities : ndarray
The point estimates of the probabilities corresponding with
`quantiles`.
And the following methods:
evaluate(x) :
Evaluate the CDF/SF at the argument.
plot(ax) :
Plot the CDF/SF on the provided axes.
confidence_interval(confidence_level=0.95) :
Compute the confidence interval around the CDF/SF at the values in
`quantiles`.
Notes
-----
When each observation of the sample is a precise measurement, the ECDF
steps up by ``1/len(sample)`` at each of the observations [1]_.
When observations are lower bounds, upper bounds, or both upper and lower
bounds, the data is said to be "censored", and `sample` may be provided as
an instance of `scipy.stats.CensoredData`.
For right-censored data, the ECDF is given by the Kaplan-Meier estimator
[2]_; other forms of censoring are not supported at this time.
Confidence intervals are computed according to the Greenwood formula or the
more recent "Exponential Greenwood" formula as described in [4]_.
References
----------
.. [1] Conover, William Jay. Practical nonparametric statistics. Vol. 350.
John Wiley & Sons, 1999.
.. [2] Kaplan, Edward L., and Paul Meier. "Nonparametric estimation from
incomplete observations." Journal of the American statistical
association 53.282 (1958): 457-481.
.. [3] Goel, Manish Kumar, Pardeep Khanna, and Jugal Kishore.
"Understanding survival analysis: Kaplan-Meier estimate."
International journal of Ayurveda research 1.4 (2010): 274.
.. [4] Sawyer, Stanley. "The Greenwood and Exponential Greenwood Confidence
Intervals in Survival Analysis."
https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf
Examples
--------
**Uncensored Data**
As in the example from [1]_ page 79, five boys were selected at random from
those in a single high school. Their one-mile run times were recorded as
follows.
>>> sample = [6.23, 5.58, 7.06, 6.42, 5.20] # one-mile run times (minutes)
The empirical distribution function, which approximates the distribution
function of one-mile run times of the population from which the boys were
sampled, is calculated as follows.
>>> from scipy import stats
>>> res = stats.ecdf(sample)
>>> res.cdf.quantiles
array([5.2 , 5.58, 6.23, 6.42, 7.06])
>>> res.cdf.probabilities
array([0.2, 0.4, 0.6, 0.8, 1. ])
To plot the result as a step function:
>>> import matplotlib.pyplot as plt
>>> ax = plt.subplot()
>>> res.cdf.plot(ax)
>>> ax.set_xlabel('One-Mile Run Time (minutes)')
>>> ax.set_ylabel('Empirical CDF')
>>> plt.show()
**Right-censored Data**
As in the example from [1]_ page 91, the lives of ten car fanbelts were
tested. Five tests concluded because the fanbelt being tested broke, but
the remaining tests concluded for other reasons (e.g. the study ran out of
    funding, but the fanbelt was still functional). The mileage driven
    with each fanbelt was recorded as follows.
>>> broken = [77, 47, 81, 56, 80] # in thousands of miles driven
>>> unbroken = [62, 60, 43, 71, 37]
Precise survival times of the fanbelts that were still functional at the
end of the tests are unknown, but they are known to exceed the values
recorded in ``unbroken``. Therefore, these observations are said to be
"right-censored", and the data is represented using
`scipy.stats.CensoredData`.
>>> sample = stats.CensoredData(uncensored=broken, right=unbroken)
The empirical survival function is calculated as follows.
>>> res = stats.ecdf(sample)
>>> res.sf.quantiles
array([37., 43., 47., 56., 60., 62., 71., 77., 80., 81.])
>>> res.sf.probabilities
array([1. , 1. , 0.875, 0.75 , 0.75 , 0.75 , 0.75 , 0.5 , 0.25 , 0. ])
To plot the result as a step function:
>>> ax = plt.subplot()
    >>> res.sf.plot(ax)
>>> ax.set_xlabel('Fanbelt Survival Time (thousands of miles)')
>>> ax.set_ylabel('Empirical SF')
>>> plt.show()
"""
sample = _iv_CensoredData(sample)
if sample.num_censored() == 0:
res = _ecdf_uncensored(sample._uncensor())
elif sample.num_censored() == sample._right.size:
res = _ecdf_right_censored(sample)
else:
# Support additional censoring options in follow-up PRs
message = ("Currently, only uncensored and right-censored data is "
"supported.")
raise NotImplementedError(message)
t, cdf, sf, n, d = res
return ECDFResult(t, cdf, sf, n, d)
def _ecdf_uncensored(sample):
sample = np.sort(sample)
x, counts = np.unique(sample, return_counts=True)
    # [1].81 "the fraction of [observations] that are less than or equal to x"
events = np.cumsum(counts)
n = sample.size
cdf = events / n
# [1].89 "the relative frequency of the sample that exceeds x in value"
sf = 1 - cdf
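    # number of observations "at risk" (i.e. >= x) just before each unique
    # value; retained so `confidence_interval` can compute Greenwood variances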
at_risk = np.concatenate(([n], n - events[:-1]))
return x, cdf, sf, at_risk, counts
def _ecdf_right_censored(sample):
# It is conventional to discuss right-censored data in terms of
# "survival time", "death", and "loss" (e.g. [2]). We'll use that
# terminology here.
# This implementation was influenced by the references cited and also
# https://www.youtube.com/watch?v=lxoWsVco_iM
# https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
# In retrospect it is probably most easily compared against [3].
# Ultimately, the data needs to be sorted, so this implementation is
# written to avoid a separate call to `unique` after sorting. In hope of
# better performance on large datasets, it also computes survival
# probabilities at unique times only rather than at each observation.
tod = sample._uncensored # time of "death"
tol = sample._right # time of "loss"
times = np.concatenate((tod, tol))
died = np.asarray([1]*tod.size + [0]*tol.size)
# sort by times
i = np.argsort(times)
times = times[i]
died = died[i]
at_risk = np.arange(times.size, 0, -1)
# logical indices of unique times
j = np.diff(times, prepend=-np.inf, append=np.inf) > 0
j_l = j[:-1] # first instances of unique times
j_r = j[1:] # last instances of unique times
# get number at risk and deaths at each unique time
t = times[j_l] # unique times
n = at_risk[j_l] # number at risk at each unique time
cd = np.cumsum(died)[j_r] # cumulative deaths up to/including unique times
d = np.diff(cd, prepend=0) # deaths at each unique time
# compute survival function
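    # Kaplan-Meier product-limit estimate: S(t) = prod_{t_j <= t} (1 - d_j/n_j)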
sf = np.cumprod((n - d) / n)
cdf = 1 - sf
return t, cdf, sf, n, d
@dataclass
class LogRankResult:
"""Result object returned by `scipy.stats.logrank`.
Attributes
----------
statistic : float ndarray
The computed statistic (defined below). Its magnitude is the
square root of the magnitude returned by most other logrank test
implementations.
pvalue : float ndarray
The computed p-value of the test.
"""
statistic: np.ndarray
pvalue: np.ndarray
def logrank(
x: npt.ArrayLike | CensoredData,
y: npt.ArrayLike | CensoredData,
alternative: Literal['two-sided', 'less', 'greater'] = "two-sided"
) -> LogRankResult:
r"""Compare the survival distributions of two samples via the logrank test.
Parameters
----------
x, y : array_like or CensoredData
Samples to compare based on their empirical survival functions.
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis.
The null hypothesis is that the survival distributions of the two
groups, say *X* and *Y*, are identical.
The following alternative hypotheses [4]_ are available (default is
'two-sided'):
* 'two-sided': the survival distributions of the two groups are not
identical.
* 'less': survival of group *X* is favored: the group *X* failure rate
function is less than the group *Y* failure rate function at some
times.
* 'greater': survival of group *Y* is favored: the group *X* failure
rate function is greater than the group *Y* failure rate function at
some times.
Returns
-------
res : `~scipy.stats._result_classes.LogRankResult`
An object containing attributes:
statistic : float ndarray
The computed statistic (defined below). Its magnitude is the
square root of the magnitude returned by most other logrank test
implementations.
pvalue : float ndarray
The computed p-value of the test.
See Also
--------
scipy.stats.ecdf
Notes
-----
The logrank test [1]_ compares the observed number of events to
the expected number of events under the null hypothesis that the two
samples were drawn from the same distribution. The statistic is
.. math::
Z_i = \frac{\sum_{j=1}^J(O_{i,j}-E_{i,j})}{\sqrt{\sum_{j=1}^J V_{i,j}}}
\rightarrow \mathcal{N}(0,1)
where
.. math::
E_{i,j} = O_j \frac{N_{i,j}}{N_j},
\qquad
V_{i,j} = E_{i,j} \left(\frac{N_j-O_j}{N_j}\right)
\left(\frac{N_j-N_{i,j}}{N_j-1}\right),
:math:`i` denotes the group (i.e. it may assume values :math:`x` or
    :math:`y`, or it may be omitted to refer to the combined sample),
:math:`j` denotes the time (at which an event occurred),
:math:`N` is the number of subjects at risk just before an event occurred,
and :math:`O` is the observed number of events at that time.
The ``statistic`` :math:`Z_x` returned by `logrank` is the (signed) square
root of the statistic returned by many other implementations. Under the
    null hypothesis, :math:`Z_x^2` is asymptotically distributed according to
the chi-squared distribution with one degree of freedom. Consequently,
:math:`Z_x` is asymptotically distributed according to the standard normal
distribution. The advantage of using :math:`Z_x` is that the sign
information (i.e. whether the observed number of events tends to be less
than or greater than the number expected under the null hypothesis) is
preserved, allowing `scipy.stats.logrank` to offer one-sided alternative
hypotheses.
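    For the default two-sided alternative, the reported p-value is therefore
    equivalent (up to floating-point error) to the chi-squared tail
    probability of the squared statistic; with ``x`` and ``y`` as in the
    Examples below, this could be checked roughly as::

        from scipy import stats
        res = stats.logrank(x, y)
        stats.chi2(df=1).sf(res.statistic**2)  # approximately res.pvalue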
References
----------
.. [1] Mantel N. "Evaluation of survival data and two new rank order
statistics arising in its consideration."
Cancer Chemotherapy Reports, 50(3):163-170, PMID: 5910392, 1966
.. [2] Bland, Altman, "The logrank test", BMJ, 328:1073,
:doi:`10.1136/bmj.328.7447.1073`, 2004
.. [3] "Logrank test", Wikipedia,
https://en.wikipedia.org/wiki/Logrank_test
.. [4] Brown, Mark. "On the choice of variance for the log rank test."
Biometrika 71.1 (1984): 65-74.
.. [5] Klein, John P., and Melvin L. Moeschberger. Survival analysis:
techniques for censored and truncated data. Vol. 1230. New York:
Springer, 2003.
Examples
--------
Reference [2]_ compared the survival times of patients with two different
types of recurrent malignant gliomas. The samples below record the time
(number of weeks) for which each patient participated in the study. The
`scipy.stats.CensoredData` class is used because the data is
right-censored: the uncensored observations correspond with observed deaths
whereas the censored observations correspond with the patient leaving the
study for another reason.
>>> from scipy import stats
>>> x = stats.CensoredData(
... uncensored=[6, 13, 21, 30, 37, 38, 49, 50,
... 63, 79, 86, 98, 202, 219],
... right=[31, 47, 80, 82, 82, 149]
... )
>>> y = stats.CensoredData(
... uncensored=[10, 10, 12, 13, 14, 15, 16, 17, 18, 20, 24, 24,
    ...                 25, 28, 30, 33, 35, 37, 40, 40, 46, 48, 76, 81,
... 82, 91, 112, 181],
... right=[34, 40, 70]
... )
We can calculate and visualize the empirical survival functions
of both groups as follows.
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> ax = plt.subplot()
>>> ecdf_x = stats.ecdf(x)
>>> ecdf_x.sf.plot(ax, label='Astrocytoma')
>>> ecdf_y = stats.ecdf(y)
>>> ecdf_y.sf.plot(ax, label='Glioblastoma')
>>> ax.set_xlabel('Time to death (weeks)')
>>> ax.set_ylabel('Empirical SF')
>>> plt.legend()
>>> plt.show()
Visual inspection of the empirical survival functions suggests that the
survival times tend to be different between the two groups. To formally
assess whether the difference is significant at the 1% level, we use the
logrank test.
>>> res = stats.logrank(x=x, y=y)
>>> res.statistic
-2.73799
>>> res.pvalue
0.00618
The p-value is less than 1%, so we can consider the data to be evidence
against the null hypothesis in favor of the alternative that there is a
difference between the two survival functions.
"""
# Input validation. `alternative` IV handled in `_get_pvalue` below.
x = _iv_CensoredData(sample=x, param_name='x')
y = _iv_CensoredData(sample=y, param_name='y')
# Combined sample. (Under H0, the two groups are identical.)
xy = CensoredData(
uncensored=np.concatenate((x._uncensored, y._uncensored)),
right=np.concatenate((x._right, y._right))
)
# Extract data from the combined sample
res = ecdf(xy)
idx = res.sf._d.astype(bool) # indices of observed events
times_xy = res.sf.quantiles[idx] # unique times of observed events
at_risk_xy = res.sf._n[idx] # combined number of subjects at risk
deaths_xy = res.sf._d[idx] # combined number of events
# Get the number at risk within each sample.
# First compute the number at risk in group X at each of the `times_xy`.
# Could use `interpolate_1d`, but this is more compact.
res_x = ecdf(x)
i = np.searchsorted(res_x.sf.quantiles, times_xy)
at_risk_x = np.append(res_x.sf._n, 0)[i] # 0 at risk after last time
# Subtract from the combined number at risk to get number at risk in Y
at_risk_y = at_risk_xy - at_risk_x
# Compute the variance.
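    # This is the hypergeometric variance of the number of deaths in group X
    # at each event time, i.e. V_{x,j} from the Notes:
    #   N_x * N_y * O * (N - O) / (N**2 * (N - 1))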
num = at_risk_x * at_risk_y * deaths_xy * (at_risk_xy - deaths_xy)
den = at_risk_xy**2 * (at_risk_xy - 1)
# Note: when `at_risk_xy == 1`, we would have `at_risk_xy - 1 == 0` in the
# numerator and denominator. Simplifying the fraction symbolically, we
# would always find the overall quotient to be zero, so don't compute it.
i = at_risk_xy > 1
sum_var = np.sum(num[i]/den[i])
# Get the observed and expected number of deaths in group X
n_died_x = x._uncensored.size
sum_exp_deaths_x = np.sum(at_risk_x * (deaths_xy/at_risk_xy))
# Compute the statistic. This is the square root of that in references.
statistic = (n_died_x - sum_exp_deaths_x)/np.sqrt(sum_var)
# Equivalent to chi2(df=1).sf(statistic**2) when alternative='two-sided'
norm = stats._stats_py._SimpleNormal()
pvalue = stats._stats_py._get_pvalue(statistic, norm, alternative, xp=np)
return LogRankResult(statistic=statistic[()], pvalue=pvalue[()])

View File

@ -0,0 +1,199 @@
import numpy as np
from numpy import poly1d
from scipy.special import beta
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
#
# def mpvar(lam):
# if lam == 0:
# v = mp.pi**2 / three
# else:
# v = (two / lam**2) * (one / (one + two*lam) -
# mp.beta(lam + one, lam + one))
# return v
#
# t = mp.taylor(mpvar, 0, 8)
# p, q = mp.pade(t, 4, 4)
# print("p =", [mp.fp.mpf(c) for c in p])
# print("q =", [mp.fp.mpf(c) for c in q])
#---------------------------------------------------------------------------
# Pade coefficients for the Tukey Lambda variance function.
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
-0.5370742306855439, 0.17292046290190008,
-0.02371146284628187]
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
1.7660926747377275, 0.2643989311168465]
# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda variance.
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])
def tukeylambda_variance(lam):
"""Variance of the Tukey Lambda distribution.
Parameters
----------
lam : array_like
The lambda values at which to compute the variance.
Returns
-------
v : ndarray
The variance. For lam < -0.5, the variance is not defined, so
        np.nan is returned. For lam = -0.5, np.inf is returned.
Notes
-----
In an interval around lambda=0, this function uses the [4,4] Pade
approximation to compute the variance. Otherwise it uses the standard
formula (https://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
Pade approximation is used because the standard formula has a removable
discontinuity at lambda = 0, and does not produce accurate numerical
results near lambda = 0.
"""
lam = np.asarray(lam)
shp = lam.shape
lam = np.atleast_1d(lam).astype(np.float64)
# For absolute values of lam less than threshold, use the Pade
# approximation.
threshold = 0.075
# Play games with masks to implement the conditional evaluation of
# the distribution.
# lambda < -0.5: var = nan
low_mask = lam < -0.5
# lambda == -0.5: var = inf
neghalf_mask = lam == -0.5
# abs(lambda) < threshold: use Pade approximation
small_mask = np.abs(lam) < threshold
# else the "regular" case: use the explicit formula.
reg_mask = ~(low_mask | neghalf_mask | small_mask)
# Get the 'lam' values for the cases where they are needed.
small = lam[small_mask]
reg = lam[reg_mask]
# Compute the function for each case.
v = np.empty_like(lam)
v[low_mask] = np.nan
v[neghalf_mask] = np.inf
if small.size > 0:
# Use the Pade approximation near lambda = 0.
v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
if reg.size > 0:
v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
beta(reg + 1, reg + 1))
v.shape = shp
return v
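# Illustrative sanity check (not part of the public API): at lam = 0 the
# Tukey lambda distribution reduces to the logistic distribution, so the
# Pade approximation above should return a value very close to pi**2 / 3:
#
#     tukeylambda_variance(0.0)   # approximately 3.2899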
# The following code was used to generate the Pade coefficients for the
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
#---------------------------------------------------------------------------
# import mpmath as mp
#
# mp.mp.dps = 60
#
# one = mp.mpf(1)
# two = mp.mpf(2)
# three = mp.mpf(3)
# four = mp.mpf(4)
#
# def mpkurt(lam):
# if lam == 0:
# k = mp.mpf(6)/5
# else:
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
# three*mp.beta(two*lam+one, two*lam+one))
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
# k = numer / denom - three
# return k
#
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
# # taylor function and we request a degree 9 Taylor polynomial, we actually
# # get degree 8.
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
# t = [mp.chop(c, tol=1e-15) for c in t]
# p, q = mp.pade(t, 4, 4)
# print("p =", [mp.fp.mpf(c) for c in p])
# print("q =", [mp.fp.mpf(c) for c in q])
#---------------------------------------------------------------------------
# Pade coefficients for the Tukey Lambda kurtosis function.
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
0.20601184383406815, 4.59796302262789]
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
0.43075235247853005, -2.789746758009912]
# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda kurtosis.
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])
def tukeylambda_kurtosis(lam):
"""Kurtosis of the Tukey Lambda distribution.
Parameters
----------
lam : array_like
        The lambda values at which to compute the kurtosis.
Returns
-------
    k : ndarray
        The kurtosis. For lam < -0.25, the kurtosis is not defined, so
        np.nan is returned. For lam = -0.25, np.inf is returned.
"""
lam = np.asarray(lam)
shp = lam.shape
lam = np.atleast_1d(lam).astype(np.float64)
# For absolute values of lam less than threshold, use the Pade
# approximation.
threshold = 0.055
# Use masks to implement the conditional evaluation of the kurtosis.
# lambda < -0.25: kurtosis = nan
low_mask = lam < -0.25
# lambda == -0.25: kurtosis = inf
negqrtr_mask = lam == -0.25
# lambda near 0: use Pade approximation
small_mask = np.abs(lam) < threshold
# else the "regular" case: use the explicit formula.
reg_mask = ~(low_mask | negqrtr_mask | small_mask)
# Get the 'lam' values for the cases where they are needed.
small = lam[small_mask]
reg = lam[reg_mask]
# Compute the function for each case.
k = np.empty_like(lam)
k[low_mask] = np.nan
k[negqrtr_mask] = np.inf
if small.size > 0:
k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
if reg.size > 0:
numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
3 * beta(2 * reg + 1, 2 * reg + 1))
denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
k[reg_mask] = numer / denom - 3
# The return value will be a numpy array; resetting the shape ensures that
# if `lam` was a scalar, the return value is a 0-d array.
k.shape = shp
return k

View File

@ -0,0 +1,179 @@
from __future__ import annotations
import numpy as np
from typing import (overload, Callable, NamedTuple, Protocol)
import numpy.typing as npt
from scipy._lib._util import SeedType
import scipy.stats as stats
ArrayLike0D = bool | int | float | complex | str | bytes | np.generic
__all__: list[str]
class UNURANError(RuntimeError):
...
class Method:
@overload
def rvs(self, size: None = ...) -> float | int: ... # type: ignore[overload-overlap]
@overload
def rvs(self, size: int | tuple[int, ...] = ...) -> np.ndarray: ...
def set_random_state(self, random_state: SeedType) -> None: ...
class TDRDist(Protocol):
@property
def pdf(self) -> Callable[..., float]: ...
@property
def dpdf(self) -> Callable[..., float]: ...
@property
def support(self) -> tuple[float, float]: ...
class TransformedDensityRejection(Method):
def __init__(self,
dist: TDRDist,
*,
mode: None | float = ...,
center: None | float = ...,
domain: None | tuple[float, float] = ...,
c: float = ...,
construction_points: int | npt.ArrayLike = ...,
use_dars: bool = ...,
max_squeeze_hat_ratio: float = ...,
random_state: SeedType = ...) -> None: ...
@property
def squeeze_hat_ratio(self) -> float: ...
@property
def squeeze_area(self) -> float: ...
@overload
def ppf_hat(self, u: ArrayLike0D) -> float: ... # type: ignore[overload-overlap]
@overload
def ppf_hat(self, u: npt.ArrayLike) -> np.ndarray: ...
class SROUDist(Protocol):
@property
def pdf(self) -> Callable[..., float]: ...
@property
def support(self) -> tuple[float, float]: ...
class SimpleRatioUniforms(Method):
def __init__(self,
dist: SROUDist,
*,
mode: None | float = ...,
pdf_area: float = ...,
domain: None | tuple[float, float] = ...,
cdf_at_mode: float = ...,
random_state: SeedType = ...) -> None: ...
class UError(NamedTuple):
max_error: float
mean_absolute_error: float
class PINVDist(Protocol):
@property
def pdf(self) -> Callable[..., float]: ...
@property
def cdf(self) -> Callable[..., float]: ...
@property
def logpdf(self) -> Callable[..., float]: ...
class NumericalInversePolynomial(Method):
def __init__(self,
dist: PINVDist,
*,
mode: None | float = ...,
center: None | float = ...,
domain: None | tuple[float, float] = ...,
order: int = ...,
u_resolution: float = ...,
random_state: SeedType = ...) -> None: ...
@property
def intervals(self) -> int: ...
@overload
def ppf(self, u: ArrayLike0D) -> float: ... # type: ignore[overload-overlap]
@overload
def ppf(self, u: npt.ArrayLike) -> np.ndarray: ...
@overload
def cdf(self, x: ArrayLike0D) -> float: ... # type: ignore[overload-overlap]
@overload
def cdf(self, x: npt.ArrayLike) -> np.ndarray: ...
def u_error(self, sample_size: int = ...) -> UError: ...
def qrvs(self,
size: None | int | tuple[int, ...] = ...,
d: None | int = ...,
qmc_engine: None | stats.qmc.QMCEngine = ...) -> npt.ArrayLike: ...
class HINVDist(Protocol):
@property
def pdf(self) -> Callable[..., float]: ...
@property
def cdf(self) -> Callable[..., float]: ...
@property
def support(self) -> tuple[float, float]: ...
class NumericalInverseHermite(Method):
def __init__(self,
dist: HINVDist,
*,
domain: None | tuple[float, float] = ...,
                 order: int = ...,
u_resolution: float = ...,
construction_points: None | npt.ArrayLike = ...,
max_intervals: int = ...,
random_state: SeedType = ...) -> None: ...
@property
def intervals(self) -> int: ...
@overload
def ppf(self, u: ArrayLike0D) -> float: ... # type: ignore[overload-overlap]
@overload
def ppf(self, u: npt.ArrayLike) -> np.ndarray: ...
def qrvs(self,
size: None | int | tuple[int, ...] = ...,
d: None | int = ...,
qmc_engine: None | stats.qmc.QMCEngine = ...) -> npt.ArrayLike: ...
def u_error(self, sample_size: int = ...) -> UError: ...
class DAUDist(Protocol):
@property
def pmf(self) -> Callable[..., float]: ...
@property
def support(self) -> tuple[float, float]: ...
class DiscreteAliasUrn(Method):
def __init__(self,
dist: npt.ArrayLike | DAUDist,
*,
domain: None | tuple[float, float] = ...,
urn_factor: float = ...,
random_state: SeedType = ...) -> None: ...
class DGTDist(Protocol):
@property
def pmf(self) -> Callable[..., float]: ...
@property
def support(self) -> tuple[float, float]: ...
class DiscreteGuideTable(Method):
def __init__(self,
dist: npt.ArrayLike | DGTDist,
*,
domain: None | tuple[float, float] = ...,
guide_factor: float = ...,
random_state: SeedType = ...) -> None: ...
@overload
def ppf(self, u: ArrayLike0D) -> float: ... # type: ignore[overload-overlap]
@overload
def ppf(self, u: npt.ArrayLike) -> np.ndarray: ...

View File

@ -0,0 +1,128 @@
import numpy as np
from scipy._lib._util import _get_nan
from scipy._lib._array_api import array_namespace, xp_copysign
from ._axis_nan_policy import _axis_nan_policy_factory
@_axis_nan_policy_factory(
lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,)
)
def variation(a, axis=0, nan_policy='propagate', ddof=0, *, keepdims=False):
"""
Compute the coefficient of variation.
The coefficient of variation is the standard deviation divided by the
mean. This function is equivalent to::
        np.std(x, axis=axis, ddof=ddof) / np.mean(x, axis=axis)
The default for ``ddof`` is 0, but many definitions of the coefficient
of variation use the square root of the unbiased sample variance
for the sample standard deviation, which corresponds to ``ddof=1``.
The function does not take the absolute value of the mean of the data,
so the return value is negative if the mean is negative.
Parameters
----------
a : array_like
Input array.
axis : int or None, optional
Axis along which to calculate the coefficient of variation.
Default is 0. If None, compute over the whole array `a`.
nan_policy : {'propagate', 'raise', 'omit'}, optional
Defines how to handle when input contains ``nan``.
The following options are available:
* 'propagate': return ``nan``
* 'raise': raise an exception
* 'omit': perform the calculation with ``nan`` values omitted
The default is 'propagate'.
ddof : int, optional
Gives the "Delta Degrees Of Freedom" used when computing the
standard deviation. The divisor used in the calculation of the
standard deviation is ``N - ddof``, where ``N`` is the number of
elements. `ddof` must be less than ``N``; if it isn't, the result
will be ``nan`` or ``inf``, depending on ``N`` and the values in
the array. By default `ddof` is zero for backwards compatibility,
but it is recommended to use ``ddof=1`` to ensure that the sample
standard deviation is computed as the square root of the unbiased
sample variance.
Returns
-------
variation : ndarray
The calculated variation along the requested axis.
Notes
-----
There are several edge cases that are handled without generating a
warning:
* If both the mean and the standard deviation are zero, ``nan``
is returned.
* If the mean is zero and the standard deviation is nonzero, ``inf``
is returned.
* If the input has length zero (either because the array has zero
length, or all the input values are ``nan`` and ``nan_policy`` is
``'omit'``), ``nan`` is returned.
* If the input contains ``inf``, ``nan`` is returned.
References
----------
.. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
Probability and Statistics Tables and Formulae. Chapman & Hall: New
York. 2000.
Examples
--------
>>> import numpy as np
>>> from scipy.stats import variation
>>> variation([1, 2, 3, 4, 5], ddof=1)
0.5270462766947299
Compute the variation along a given dimension of an array that contains
a few ``nan`` values:
>>> x = np.array([[ 10.0, np.nan, 11.0, 19.0, 23.0, 29.0, 98.0],
... [ 29.0, 30.0, 32.0, 33.0, 35.0, 56.0, 57.0],
... [np.nan, np.nan, 12.0, 13.0, 16.0, 16.0, 17.0]])
>>> variation(x, axis=1, ddof=1, nan_policy='omit')
array([1.05109361, 0.31428986, 0.146483 ])
"""
xp = array_namespace(a)
a = xp.asarray(a)
# `nan_policy` and `keepdims` are handled by `_axis_nan_policy`
# `axis=None` is only handled for NumPy backend
if axis is None:
a = xp.reshape(a, (-1,))
axis = 0
n = a.shape[axis]
NaN = _get_nan(a)
if a.size == 0 or ddof > n:
# Handle as a special case to avoid spurious warnings.
# The return values, if any, are all nan.
shp = list(a.shape)
shp.pop(axis)
result = xp.full(shp, fill_value=NaN)
return result[()] if result.ndim == 0 else result
mean_a = xp.mean(a, axis=axis)
if ddof == n:
# Another special case. Result is either inf or nan.
std_a = xp.std(a, axis=axis, correction=0)
result = xp.where(std_a > 0, xp_copysign(xp.asarray(xp.inf), mean_a), NaN)
return result[()] if result.ndim == 0 else result
with np.errstate(divide='ignore', invalid='ignore'):
std_a = xp.std(a, axis=axis, correction=ddof)
result = std_a / mean_a
return result[()] if result.ndim == 0 else result

View File

@ -0,0 +1,38 @@
# Warnings
class DegenerateDataWarning(RuntimeWarning):
"""Warns when data is degenerate and results may not be reliable."""
def __init__(self, msg=None):
if msg is None:
msg = ("Degenerate data encountered; results may not be reliable.")
self.args = (msg,)
class ConstantInputWarning(DegenerateDataWarning):
"""Warns when all values in data are exactly equal."""
def __init__(self, msg=None):
if msg is None:
msg = ("All values in data are exactly equal; "
"results may not be reliable.")
self.args = (msg,)
class NearConstantInputWarning(DegenerateDataWarning):
"""Warns when all values in data are nearly equal."""
def __init__(self, msg=None):
if msg is None:
msg = ("All values in data are nearly equal; "
"results may not be reliable.")
self.args = (msg,)
# Errors
class FitError(RuntimeError):
"""Represents an error condition when fitting a distribution to data."""
def __init__(self, msg=None):
if msg is None:
msg = ("An error occurred when fitting a distribution to data.")
self.args = (msg,)
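# Illustrative usage (relies on the public re-exports in `scipy.stats`):
# these warnings can be filtered or escalated with the standard `warnings`
# machinery, e.g.
#
#     import warnings
#     from scipy.stats import ConstantInputWarning
#     warnings.simplefilter("error", ConstantInputWarning)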

View File

@ -0,0 +1,246 @@
import warnings
import numpy as np
from scipy import stats
from ._stats_py import _get_pvalue, _rankdata, _SimpleNormal
from . import _morestats
from ._axis_nan_policy import _broadcast_arrays
from ._hypotests import _get_wilcoxon_distr
from scipy._lib._util import _lazywhere, _get_nan
class WilcoxonDistribution:
def __init__(self, n):
n = np.asarray(n).astype(int, copy=False)
self.n = n
self._dists = {ni: _get_wilcoxon_distr(ni) for ni in np.unique(n)}
def _cdf1(self, k, n):
pmfs = self._dists[n]
return pmfs[:k + 1].sum()
def _cdf(self, k, n):
return np.vectorize(self._cdf1, otypes=[float])(k, n)
def _sf1(self, k, n):
pmfs = self._dists[n]
return pmfs[k:].sum()
def _sf(self, k, n):
return np.vectorize(self._sf1, otypes=[float])(k, n)
def mean(self):
return self.n * (self.n + 1) / 4
def _prep(self, k):
k = np.asarray(k).astype(int, copy=False)
mn = self.mean()
out = np.empty(k.shape, dtype=np.float64)
return k, mn, out
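    # Note: cdf(k) = 1 - sf(k + 1) exactly for this integer-valued statistic;
    # the branch on k <= mean below merely sums whichever tail of the PMF has
    # fewer terms.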
def cdf(self, k):
k, mn, out = self._prep(k)
return _lazywhere(k <= mn, (k, self.n), self._cdf,
f2=lambda k, n: 1 - self._sf(k+1, n))[()]
def sf(self, k):
k, mn, out = self._prep(k)
return _lazywhere(k <= mn, (k, self.n), self._sf,
f2=lambda k, n: 1 - self._cdf(k-1, n))[()]
def _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis):
axis = np.asarray(axis)[()]
message = "`axis` must be an integer."
if not np.issubdtype(axis.dtype, np.integer) or axis.ndim != 0:
raise ValueError(message)
message = '`axis` must be compatible with the shape(s) of `x` (and `y`)'
try:
if y is None:
x = np.asarray(x)
d = x
else:
x, y = _broadcast_arrays((x, y), axis=axis)
d = x - y
d = np.moveaxis(d, axis, -1)
except np.AxisError as e:
raise ValueError(message) from e
message = "`x` and `y` must have the same length along `axis`."
if y is not None and x.shape[axis] != y.shape[axis]:
raise ValueError(message)
message = "`x` (and `y`, if provided) must be an array of real numbers."
if np.issubdtype(d.dtype, np.integer):
d = d.astype(np.float64)
if not np.issubdtype(d.dtype, np.floating):
raise ValueError(message)
zero_method = str(zero_method).lower()
zero_methods = {"wilcox", "pratt", "zsplit"}
message = f"`zero_method` must be one of {zero_methods}."
if zero_method not in zero_methods:
raise ValueError(message)
corrections = {True, False}
message = f"`correction` must be one of {corrections}."
if correction not in corrections:
raise ValueError(message)
alternative = str(alternative).lower()
alternatives = {"two-sided", "less", "greater"}
message = f"`alternative` must be one of {alternatives}."
if alternative not in alternatives:
raise ValueError(message)
if not isinstance(method, stats.PermutationMethod):
methods = {"auto", "approx", "exact"}
message = (f"`method` must be one of {methods} or "
"an instance of `stats.PermutationMethod`.")
if method not in methods:
raise ValueError(message)
    output_z = (method == 'approx')
# logic unchanged here for backward compatibility
n_zero = np.sum(d == 0, axis=-1)
has_zeros = np.any(n_zero > 0)
if method == "auto":
if d.shape[-1] <= 50 and not has_zeros:
method = "exact"
else:
method = "approx"
n_zero = np.sum(d == 0)
if n_zero > 0 and method == "exact":
method = "approx"
warnings.warn("Exact p-value calculation does not work if there are "
"zeros. Switching to normal approximation.",
stacklevel=2)
if (method == "approx" and zero_method in ["wilcox", "pratt"]
and n_zero == d.size and d.size > 0 and d.ndim == 1):
raise ValueError("zero_method 'wilcox' and 'pratt' do not "
"work if x - y is zero for all elements.")
if 0 < d.shape[-1] < 10 and method == "approx":
warnings.warn("Sample size too small for normal approximation.", stacklevel=2)
return d, zero_method, correction, alternative, method, axis, output_z
def _wilcoxon_statistic(d, zero_method='wilcox'):
i_zeros = (d == 0)
if zero_method == 'wilcox':
# Wilcoxon's method for treating zeros was to remove them from
# the calculation. We do this by replacing 0s with NaNs, which
# are ignored anyway.
if not d.flags['WRITEABLE']:
d = d.copy()
d[i_zeros] = np.nan
i_nan = np.isnan(d)
n_nan = np.sum(i_nan, axis=-1)
count = d.shape[-1] - n_nan
r, t = _rankdata(abs(d), 'average', return_ties=True)
r_plus = np.sum((d > 0) * r, axis=-1)
r_minus = np.sum((d < 0) * r, axis=-1)
if zero_method == "zsplit":
# The "zero-split" method for treating zeros is to add half their contribution
# to r_plus and half to r_minus.
# See gh-2263 for the origin of this method.
r_zero_2 = np.sum(i_zeros * r, axis=-1) / 2
r_plus += r_zero_2
r_minus += r_zero_2
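    # mean and 24x the variance of r_plus under the null hypothesis; the
    # factor of 24 is divided out below, after the zero- and tie-corrections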
mn = count * (count + 1.) * 0.25
se = count * (count + 1.) * (2. * count + 1.)
if zero_method == "pratt":
# Pratt's method for treating zeros was just to modify the z-statistic.
# normal approximation needs to be adjusted, see Cureton (1967)
n_zero = i_zeros.sum(axis=-1)
mn -= n_zero * (n_zero + 1.) * 0.25
se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)
# zeros are not to be included in tie-correction.
# any tie counts corresponding with zeros are in the 0th column
t[i_zeros.any(axis=-1), 0] = 0
tie_correct = (t**3 - t).sum(axis=-1)
se -= tie_correct/2
se = np.sqrt(se / 24)
z = (r_plus - mn) / se
return r_plus, r_minus, se, z, count
def _correction_sign(z, alternative):
if alternative == 'greater':
return 1
elif alternative == 'less':
return -1
else:
return np.sign(z)
def _wilcoxon_nd(x, y=None, zero_method='wilcox', correction=True,
alternative='two-sided', method='auto', axis=0):
temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)
d, zero_method, correction, alternative, method, axis, output_z = temp
if d.size == 0:
NaN = _get_nan(d)
res = _morestats.WilcoxonResult(statistic=NaN, pvalue=NaN)
if method == 'approx':
res.zstatistic = NaN
return res
r_plus, r_minus, se, z, count = _wilcoxon_statistic(d, zero_method)
if method == 'approx':
if correction:
sign = _correction_sign(z, alternative)
z -= sign * 0.5 / se
p = _get_pvalue(z, _SimpleNormal(), alternative, xp=np)
elif method == 'exact':
dist = WilcoxonDistribution(count)
# The null distribution in `dist` is exact only if there are no ties
# or zeros. If there are ties or zeros, the statistic can be non-
# integral, but the null distribution is only defined for integral
# values of the statistic. Therefore, we're conservative: round
# non-integral statistic up before computing CDF and down before
# computing SF. This preserves symmetry w.r.t. alternatives and
# order of the input arguments. See gh-19872.
if alternative == 'less':
p = dist.cdf(np.ceil(r_plus))
elif alternative == 'greater':
p = dist.sf(np.floor(r_plus))
else:
p = 2 * np.minimum(dist.sf(np.floor(r_plus)),
dist.cdf(np.ceil(r_plus)))
p = np.clip(p, 0, 1)
else: # `PermutationMethod` instance (already validated)
p = stats.permutation_test(
(d,), lambda d: _wilcoxon_statistic(d, zero_method)[0],
permutation_type='samples', **method._asdict(),
alternative=alternative, axis=-1).pvalue
# for backward compatibility...
statistic = np.minimum(r_plus, r_minus) if alternative=='two-sided' else r_plus
z = -np.abs(z) if (alternative == 'two-sided' and method == 'approx') else z
res = _morestats.WilcoxonResult(statistic=statistic, pvalue=p[()])
if output_z:
res.zstatistic = z[()]
return res

View File

@ -0,0 +1,16 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="biasedurn",
private_modules=["_biasedurn"], all=__all__,
attribute=name)

View File

@ -0,0 +1,468 @@
"""
Contingency table functions (:mod:`scipy.stats.contingency`)
============================================================
Functions for creating and analyzing contingency tables.
.. currentmodule:: scipy.stats.contingency
.. autosummary::
:toctree: generated/
chi2_contingency
relative_risk
odds_ratio
crosstab
association
expected_freq
margins
"""
from functools import reduce
import math
import numpy as np
from ._stats_py import power_divergence
from ._relative_risk import relative_risk
from ._crosstab import crosstab
from ._odds_ratio import odds_ratio
from scipy._lib._bunch import _make_tuple_bunch
__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
'association', 'relative_risk', 'odds_ratio']
def margins(a):
"""Return a list of the marginal sums of the array `a`.
Parameters
----------
a : ndarray
The array for which to compute the marginal sums.
Returns
-------
margsums : list of ndarrays
A list of length `a.ndim`. `margsums[k]` is the result
of summing `a` over all axes except `k`; it has the same
number of dimensions as `a`, but the length of each axis
except axis `k` will be 1.
Examples
--------
>>> import numpy as np
>>> from scipy.stats.contingency import margins
>>> a = np.arange(12).reshape(2, 6)
>>> a
array([[ 0, 1, 2, 3, 4, 5],
[ 6, 7, 8, 9, 10, 11]])
>>> m0, m1 = margins(a)
>>> m0
array([[15],
[51]])
>>> m1
array([[ 6, 8, 10, 12, 14, 16]])
>>> b = np.arange(24).reshape(2,3,4)
>>> m0, m1, m2 = margins(b)
>>> m0
array([[[ 66]],
[[210]]])
>>> m1
array([[[ 60],
[ 92],
[124]]])
>>> m2
array([[[60, 66, 72, 78]]])
"""
margsums = []
ranged = list(range(a.ndim))
for k in ranged:
marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
margsums.append(marg)
return margsums
def expected_freq(observed):
"""
Compute the expected frequencies from a contingency table.
Given an n-dimensional contingency table of observed frequencies,
compute the expected frequencies for the table based on the marginal
sums under the assumption that the groups associated with each
dimension are independent.
Parameters
----------
observed : array_like
The table of observed frequencies. (While this function can handle
a 1-D array, that case is trivial. Generally `observed` is at
least 2-D.)
Returns
-------
expected : ndarray of float64
The expected frequencies, based on the marginal sums of the table.
Same shape as `observed`.
Examples
--------
>>> import numpy as np
>>> from scipy.stats.contingency import expected_freq
>>> observed = np.array([[10, 10, 20],[20, 20, 20]])
>>> expected_freq(observed)
array([[ 12., 12., 16.],
[ 18., 18., 24.]])
"""
# Typically `observed` is an integer array. If `observed` has a large
# number of dimensions or holds large values, some of the following
# computations may overflow, so we first switch to floating point.
observed = np.asarray(observed, dtype=np.float64)
# Create a list of the marginal sums.
margsums = margins(observed)
# Create the array of expected frequencies. The shapes of the
# marginal sums returned by apply_over_axes() are just what we
# need for broadcasting in the following product.
d = observed.ndim
expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
return expected
Chi2ContingencyResult = _make_tuple_bunch(
'Chi2ContingencyResult',
['statistic', 'pvalue', 'dof', 'expected_freq'], []
)
def chi2_contingency(observed, correction=True, lambda_=None):
"""Chi-square test of independence of variables in a contingency table.
This function computes the chi-square statistic and p-value for the
hypothesis test of independence of the observed frequencies in the
contingency table [1]_ `observed`. The expected frequencies are computed
based on the marginal sums under the assumption of independence; see
`scipy.stats.contingency.expected_freq`. The number of degrees of
freedom is (expressed using numpy functions and attributes)::
dof = observed.size - sum(observed.shape) + observed.ndim - 1
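    For a 2 x 3 table, for example, this gives ``6 - (2 + 3) + 2 - 1 = 2``.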
Parameters
----------
observed : array_like
The contingency table. The table contains the observed frequencies
(i.e. number of occurrences) in each category. In the two-dimensional
case, the table is often described as an "R x C table".
correction : bool, optional
If True, *and* the degrees of freedom is 1, apply Yates' correction
for continuity. The effect of the correction is to adjust each
observed value by 0.5 towards the corresponding expected value.
lambda_ : float or str, optional
By default, the statistic computed in this test is Pearson's
chi-squared statistic [2]_. `lambda_` allows a statistic from the
Cressie-Read power divergence family [3]_ to be used instead. See
`scipy.stats.power_divergence` for details.
Returns
-------
res : Chi2ContingencyResult
An object containing attributes:
statistic : float
The test statistic.
pvalue : float
The p-value of the test.
dof : int
The degrees of freedom.
expected_freq : ndarray, same shape as `observed`
The expected frequencies, based on the marginal sums of the table.
See Also
--------
scipy.stats.contingency.expected_freq
scipy.stats.fisher_exact
scipy.stats.chisquare
scipy.stats.power_divergence
scipy.stats.barnard_exact
scipy.stats.boschloo_exact
Notes
-----
An often quoted guideline for the validity of this calculation is that
the test should be used only if the observed and expected frequencies
in each cell are at least 5.
This is a test for the independence of different categories of a
population. The test is only meaningful when the dimension of
`observed` is two or more. Applying the test to a one-dimensional
table will always result in `expected` equal to `observed` and a
chi-square statistic equal to 0.
This function does not handle masked arrays, because the calculation
does not make sense with missing values.
Like `scipy.stats.chisquare`, this function computes a chi-square
statistic; the convenience this function provides is to figure out the
expected frequencies and degrees of freedom from the given contingency
table. If these were already known, and if the Yates' correction was not
required, one could use `scipy.stats.chisquare`. That is, if one calls::
res = chi2_contingency(obs, correction=False)
then the following is true::
(res.statistic, res.pvalue) == stats.chisquare(obs.ravel(),
f_exp=ex.ravel(),
                              ddof=obs.size - 1 - dof)

    where ``ex`` and ``dof`` are the expected frequencies and degrees of
    freedom returned by that call (i.e. ``res.expected_freq`` and ``res.dof``).
The `lambda_` argument was added in version 0.13.0 of scipy.
References
----------
.. [1] "Contingency table",
https://en.wikipedia.org/wiki/Contingency_table
.. [2] "Pearson's chi-squared test",
https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
.. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
pp. 440-464.
.. [4] Berger, Jeffrey S. et al. "Aspirin for the Primary Prevention of
Cardiovascular Events in Women and Men: A Sex-Specific
Meta-analysis of Randomized Controlled Trials."
JAMA, 295(3):306-313, :doi:`10.1001/jama.295.3.306`, 2006.
Examples
--------
In [4]_, the use of aspirin to prevent cardiovascular events in women
and men was investigated. The study notably concluded:
...aspirin therapy reduced the risk of a composite of
cardiovascular events due to its effect on reducing the risk of
ischemic stroke in women [...]
The article lists studies of various cardiovascular events. Let's
    focus on the ischemic stroke in women.
The following table summarizes the results of the experiment in which
participants took aspirin or a placebo on a regular basis for several
years. Cases of ischemic stroke were recorded::
                          Aspirin   Control/Placebo
        Ischemic stroke     176          230
        No stroke         21035        21018
Is there evidence that the aspirin reduces the risk of ischemic stroke?
We begin by formulating a null hypothesis :math:`H_0`:
The effect of aspirin is equivalent to that of placebo.
Let's assess the plausibility of this hypothesis with
a chi-square test.
>>> import numpy as np
>>> from scipy.stats import chi2_contingency
>>> table = np.array([[176, 230], [21035, 21018]])
>>> res = chi2_contingency(table)
>>> res.statistic
6.892569132546561
>>> res.pvalue
0.008655478161175739
Using a significance level of 5%, we would reject the null hypothesis in
favor of the alternative hypothesis: "the effect of aspirin
is not equivalent to the effect of placebo".
Because `scipy.stats.contingency.chi2_contingency` performs a two-sided
test, the alternative hypothesis does not indicate the direction of the
effect. We can use `stats.contingency.odds_ratio` to support the
conclusion that aspirin *reduces* the risk of ischemic stroke.
Below are further examples showing how larger contingency tables can be
tested.
A two-way example (2 x 3):
>>> obs = np.array([[10, 10, 20], [20, 20, 20]])
>>> res = chi2_contingency(obs)
>>> res.statistic
2.7777777777777777
>>> res.pvalue
0.24935220877729619
>>> res.dof
2
>>> res.expected_freq
array([[ 12., 12., 16.],
[ 18., 18., 24.]])
Perform the test using the log-likelihood ratio (i.e. the "G-test")
instead of Pearson's chi-squared statistic.
>>> res = chi2_contingency(obs, lambda_="log-likelihood")
>>> res.statistic
2.7688587616781319
>>> res.pvalue
0.25046668010954165
A four-way example (2 x 2 x 2 x 2):
>>> obs = np.array(
... [[[[12, 17],
... [11, 16]],
... [[11, 12],
... [15, 16]]],
... [[[23, 15],
... [30, 22]],
... [[14, 17],
... [15, 16]]]])
>>> res = chi2_contingency(obs)
>>> res.statistic
8.7584514426741897
>>> res.pvalue
0.64417725029295503
"""
observed = np.asarray(observed)
if np.any(observed < 0):
raise ValueError("All values in `observed` must be nonnegative.")
if observed.size == 0:
raise ValueError("No data; `observed` has size 0.")
expected = expected_freq(observed)
if np.any(expected == 0):
# Include one of the positions where expected is zero in
# the exception message.
zeropos = list(zip(*np.nonzero(expected == 0)))[0]
raise ValueError("The internally computed table of expected "
f"frequencies has a zero element at {zeropos}.")
# The degrees of freedom
dof = expected.size - sum(expected.shape) + expected.ndim - 1
if dof == 0:
# Degenerate case; this occurs when `observed` is 1D (or, more
# generally, when it has only one nontrivial dimension). In this
# case, we also have observed == expected, so chi2 is 0.
chi2 = 0.0
p = 1.0
else:
if dof == 1 and correction:
# Adjust `observed` according to Yates' correction for continuity.
# Magnitude of correction no bigger than difference; see gh-13875
diff = expected - observed
direction = np.sign(diff)
magnitude = np.minimum(0.5, np.abs(diff))
observed = observed + magnitude * direction
chi2, p = power_divergence(observed, expected,
ddof=observed.size - 1 - dof, axis=None,
lambda_=lambda_)
return Chi2ContingencyResult(chi2, p, dof, expected)
def association(observed, method="cramer", correction=False, lambda_=None):
"""Calculates degree of association between two nominal variables.
The function provides the option for computing one of three measures of
association between two nominal variables from the data given in a 2d
contingency table: Tschuprow's T, Pearson's Contingency Coefficient
and Cramer's V.
Parameters
----------
observed : array-like
The array of observed values
method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
The association test statistic.
correction : bool, optional
Inherited from `scipy.stats.contingency.chi2_contingency()`
lambda_ : float or str, optional
Inherited from `scipy.stats.contingency.chi2_contingency()`
Returns
-------
statistic : float
Value of the test statistic
Notes
-----
Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient, all
measure the degree to which two nominal or ordinal variables are related,
or the level of their association. This differs from correlation, although
    many mistakenly consider them equivalent. Correlation measures in
    what way two variables are related, whereas association measures how
related the variables are. As such, association does not subsume
independent variables, and is rather a test of independence. A value of
1.0 indicates perfect association, and 0.0 means the variables have no
association.
Both the Cramer's V and Tschuprow's T are extensions of the phi
coefficient. Moreover, due to the close relationship between the
Cramer's V and Tschuprow's T the returned values can often be similar
or even equivalent. They are likely to diverge more as the array shape
diverges from a 2x2.
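    In terms of the mean square contingency :math:`\phi^2 = \chi^2 / n` of an
    :math:`r \times c` table, the three statistics computed here are

    .. math::

        V = \sqrt{\frac{\phi^2}{\min(r - 1,\, c - 1)}}, \qquad
        T = \sqrt{\frac{\phi^2}{\sqrt{(r - 1)(c - 1)}}}, \qquad
        C = \sqrt{\frac{\phi^2}{1 + \phi^2}}.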
References
----------
.. [1] "Tschuprow's T",
https://en.wikipedia.org/wiki/Tschuprow's_T
.. [2] Tschuprow, A. A. (1939)
Principles of the Mathematical Theory of Correlation;
translated by M. Kantorowitsch. W. Hodge & Co.
.. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
.. [4] "Nominal Association: Phi and Cramer's V",
http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
.. [5] Gingrich, Paul, "Association Between Variables",
http://uregina.ca/~gingrich/ch11a.pdf
Examples
--------
An example with a 4x2 contingency table:
>>> import numpy as np
>>> from scipy.stats.contingency import association
>>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])
Pearson's contingency coefficient
>>> association(obs4x2, method="pearson")
0.18303298140595667
Cramer's V
>>> association(obs4x2, method="cramer")
0.18617813077483678
Tschuprow's T
>>> association(obs4x2, method="tschuprow")
0.14146478765062995
"""
arr = np.asarray(observed)
if not np.issubdtype(arr.dtype, np.integer):
raise ValueError("`observed` must be an integer array.")
if len(arr.shape) != 2:
raise ValueError("method only accepts 2d arrays")
chi2_stat = chi2_contingency(arr, correction=correction,
lambda_=lambda_)
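    # Mean square contingency: phi^2 = chi^2 / n, where n is the total count.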
phi2 = chi2_stat.statistic / arr.sum()
n_rows, n_cols = arr.shape
if method == "cramer":
value = phi2 / min(n_cols - 1, n_rows - 1)
elif method == "tschuprow":
value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
elif method == 'pearson':
value = phi2 / (1 + phi2)
else:
raise ValueError("Invalid argument value: 'method' argument must "
"be 'cramer', 'tschuprow', or 'pearson'")
return math.sqrt(value)

View File

@ -0,0 +1,24 @@
#
# Author: Travis Oliphant 2002-2011 with contributions from
# SciPy Developers 2004-2011
#
# NOTE: To look at history using `git blame`, use `git blame -M -C -C`
# instead of `git blame -Lxxx,+x`.
#
from ._distn_infrastructure import (rv_discrete, rv_continuous, rv_frozen) # noqa: F401
from . import _continuous_distns
from . import _discrete_distns
from ._continuous_distns import * # noqa: F403
from ._levy_stable import levy_stable
from ._discrete_distns import * # noqa: F403
from ._entropy import entropy
# For backwards compatibility e.g. pymc expects distributions.__all__.
__all__ = ['rv_discrete', 'rv_continuous', 'rv_histogram', 'entropy'] # noqa: F405
# Add only the distribution names, not the *_gen names.
__all__ += _continuous_distns._distn_names
__all__ += ['levy_stable']
__all__ += _discrete_distns._distn_names

View File

@ -0,0 +1,18 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.stats` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = ["gaussian_kde"] # noqa: F822
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="kde",
private_modules=["_kde"], all=__all__,
attribute=name)

View File

@ -0,0 +1,27 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.stats` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'mvsdist',
'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
'shapiro', 'anderson', 'ansari', 'bartlett', 'levene',
'fligner', 'mood', 'wilcoxon', 'median_test',
'circmean', 'circvar', 'circstd', 'anderson_ksamp',
'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
'yeojohnson_normplot', 'find_repeats', 'chi2_contingency', 'distributions',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="morestats",
private_modules=["_morestats"], all=__all__,
attribute=name)

View File

@ -0,0 +1,140 @@
"""
===================================================================
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
===================================================================
.. currentmodule:: scipy.stats.mstats
This module contains a large number of statistical functions that can
be used with masked arrays.
Most of these functions are similar to those in `scipy.stats` but might
have small differences in the API or in the algorithm used. Since this
is a relatively new package, some API changes are still possible.
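For example, masked entries are simply ignored by these routines:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> x = np.ma.masked_array([1.0, 2.0, 3.0, -999.0], mask=[0, 0, 0, 1])
>>> g = mstats.gmean(x)  # the masked value does not enter the computation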
Summary statistics
==================
.. autosummary::
:toctree: generated/
describe
gmean
hmean
kurtosis
mode
mquantiles
hdmedian
hdquantiles
hdquantiles_sd
idealfourths
plotting_positions
meppf
moment
skew
tmean
tvar
tmin
tmax
tsem
variation
find_repeats
sem
trimmed_mean
trimmed_mean_ci
trimmed_std
trimmed_var
Frequency statistics
====================
.. autosummary::
:toctree: generated/
scoreatpercentile
Correlation functions
=====================
.. autosummary::
:toctree: generated/
f_oneway
pearsonr
spearmanr
pointbiserialr
kendalltau
kendalltau_seasonal
linregress
siegelslopes
theilslopes
sen_seasonal_slopes
Statistical tests
=================
.. autosummary::
:toctree: generated/
ttest_1samp
ttest_onesamp
ttest_ind
ttest_rel
chisquare
kstest
ks_2samp
ks_1samp
ks_twosamp
mannwhitneyu
rankdata
kruskal
kruskalwallis
friedmanchisquare
brunnermunzel
skewtest
kurtosistest
normaltest
Transformations
===============
.. autosummary::
:toctree: generated/
obrientransform
trim
trima
trimmed_stde
trimr
trimtail
trimboth
winsorize
zmap
zscore
Other
=====
.. autosummary::
:toctree: generated/
argstoarray
count_tied_groups
msign
compare_medians_ms
median_cihs
mjci
mquantiles_cimj
rsh
"""
from . import _mstats_basic
from . import _mstats_extras
from ._mstats_basic import * # noqa: F403
from ._mstats_extras import * # noqa: F403
# Functions that support masked array input in stats but need to be kept in the
# mstats namespace for backwards compatibility:
from scipy.stats import gmean, hmean, zmap, zscore, chisquare
__all__ = _mstats_basic.__all__ + _mstats_extras.__all__
__all__ += ['gmean', 'hmean', 'zmap', 'zscore', 'chisquare']

View File

@ -0,0 +1,42 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.stats` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'argstoarray',
'count_tied_groups',
'describe',
'f_oneway', 'find_repeats','friedmanchisquare',
'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
'ks_1samp', 'kstest',
'linregress',
'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
'normaltest',
'obrientransform',
'pearsonr','plotting_positions','pointbiserialr',
'rankdata',
'scoreatpercentile','sem',
'sen_seasonal_slopes','skew','skewtest','spearmanr',
'siegelslopes', 'theilslopes',
'tmax','tmean','tmin','trim','trimboth',
'trimtail','trima','trimr','trimmed_mean','trimmed_std',
'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
'ttest_ind','ttest_rel','tvar',
'variation',
'winsorize',
'brunnermunzel',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="mstats_basic",
private_modules=["_mstats_basic"], all=__all__,
attribute=name, correct_module="mstats")

View File

@ -0,0 +1,25 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.stats` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'compare_medians_ms',
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
'idealfourths',
'median_cihs','mjci','mquantiles_cimj',
'rsh',
'trimmed_mean_ci',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="mstats_extras",
private_modules=["_mstats_extras"], all=__all__,
attribute=name, correct_module="mstats")

View File

@ -0,0 +1,17 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.stats` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__: list[str] = []
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="mvn",
private_modules=["_mvn"], all=__all__,
attribute=name)

View File

@ -0,0 +1,236 @@
r"""
====================================================
Quasi-Monte Carlo submodule (:mod:`scipy.stats.qmc`)
====================================================
.. currentmodule:: scipy.stats.qmc
This module provides Quasi-Monte Carlo generators and associated helper
functions.
Quasi-Monte Carlo
=================
Engines
-------
.. autosummary::
:toctree: generated/
QMCEngine
Sobol
Halton
LatinHypercube
PoissonDisk
MultinomialQMC
MultivariateNormalQMC
Helpers
-------
.. autosummary::
:toctree: generated/
discrepancy
geometric_discrepancy
update_discrepancy
scale
Introduction to Quasi-Monte Carlo
=================================
Quasi-Monte Carlo (QMC) methods [1]_, [2]_, [3]_ provide an
:math:`n \times d` array of numbers in :math:`[0,1]`. They can be used in
place of :math:`n` points from the :math:`U[0,1]^{d}` distribution. Compared to
random points, QMC points are designed to have fewer gaps and clumps. This is
quantified by discrepancy measures [4]_. From the Koksma-Hlawka
inequality [5]_ we know that low discrepancy reduces a bound on
integration error. Averaging a function :math:`f` over :math:`n` QMC points
can achieve an integration error close to :math:`O(n^{-1})` for well
behaved functions [2]_.
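For example, a small Sobol' sample and its (centered) discrepancy can be
computed with the helpers in this module:
>>> from scipy.stats import qmc
>>> sample = qmc.Sobol(d=2, scramble=False).random_base2(m=6)  # 64 points in [0, 1)^2
>>> disc = qmc.discrepancy(sample)  # smaller values mean more uniform coverage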
Most QMC constructions are designed for special values of :math:`n`
such as powers of 2 or large primes. Changing the sample
size by even one can degrade their performance, even their
rate of convergence [6]_. For instance :math:`n=100` points may give less
accuracy than :math:`n=64` if the method was designed for :math:`n=2^m`.
Some QMC constructions are extensible in :math:`n`: we can find
another special sample size :math:`n' > n` and often an infinite
sequence of increasing special sample sizes. Some QMC
constructions are extensible in :math:`d`: we can increase the dimension,
possibly to some upper bound, and typically without requiring
special values of :math:`d`. Some QMC methods are extensible in
both :math:`n` and :math:`d`.
QMC points are deterministic. That makes it hard to estimate the accuracy of
integrals estimated by averages over QMC points. Randomized QMC (RQMC) [7]_
points are constructed so that each point is individually :math:`U[0,1]^{d}`
while collectively the :math:`n` points retain their low discrepancy.
One can make :math:`R` independent replications of RQMC points to
see how stable a computation is. From :math:`R` independent values,
a t-test (or bootstrap t-test [8]_) then gives approximate confidence
intervals on the mean value. Some RQMC methods produce a
root mean squared error that is actually :math:`o(1/n)` and smaller than
the rate seen in unrandomized QMC. An intuitive explanation is
that the error is a sum of many small ones and random errors
cancel in a way that deterministic ones do not. RQMC also
has advantages on integrands that are singular or, for other
reasons, fail to be Riemann integrable.
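As a minimal sketch, independently scrambled Sobol' replicates of an average
can be used to attach a standard error to an RQMC estimate:
>>> import numpy as np
>>> from scipy import stats
>>> from scipy.stats import qmc
>>> rng = np.random.default_rng(12345)
>>> reps = []
>>> for _ in range(8):  # 8 independent RQMC replications
...     x = qmc.Sobol(d=1, scramble=True, seed=rng).random_base2(m=10)
...     reps.append(np.mean(x**2))  # integral of x**2 on [0, 1] is exactly 1/3
>>> se = stats.sem(reps)  # standard error across replications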
(R)QMC cannot beat Bakhvalov's curse of dimension (see [9]_). For
any random or deterministic method, there are worst case functions
that will give it poor performance in high dimensions. A worst
case function for QMC might be 0 at all n points but very
large elsewhere. Worst case analyses get very pessimistic
in high dimensions. (R)QMC can bring a great improvement over
MC when the functions on which it is used are not worst case.
For instance (R)QMC can be especially effective on integrands
that are well approximated by sums of functions of
some small number of their input variables at a time [10]_, [11]_.
That property is often a surprising finding about those functions.
Also, to see an improvement over IID MC, (R)QMC requires some smoothness of
the integrand: roughly, the mixed first-order derivative in each direction,
:math:`\partial^d f/\partial x_1 \cdots \partial x_d`, must be integrable.
For instance, a function that is 1 inside the hypersphere and 0 outside of it
has infinite variation in the sense of Hardy and Krause for any dimension
:math:`d \geq 2`.
Scrambled nets are a kind of RQMC that have some valuable robustness
properties [12]_. If the integrand is square integrable, they give variance
:math:`var_{SNET} = o(1/n)`. There is a finite upper bound on
:math:`var_{SNET} / var_{MC}` that holds simultaneously for every square
integrable integrand. Scrambled nets satisfy a strong law of large numbers
for :math:`f` in :math:`L^p` when :math:`p>1`. In some
special cases there is a central limit theorem [13]_. For smooth enough
integrands they can achieve RMSE nearly :math:`O(n^{-3})`. See [12]_
for references about these properties.
The main kinds of QMC methods are lattice rules [14]_ and digital
nets and sequences [2]_, [15]_. The theories meet up in polynomial
lattice rules [16]_ which can produce digital nets. Lattice rules
require some form of search for good constructions. For digital
nets there are widely used default constructions.
The most widely used QMC methods are Sobol' sequences [17]_.
These are digital nets. They are extensible in both :math:`n` and :math:`d`.
They can be scrambled. The special sample sizes are powers
of 2. Halton sequences [18]_ are another popular method.
The constructions resemble those of digital nets. The earlier
dimensions have much better equidistribution properties than
later ones. There are essentially no special sample sizes.
They are not thought to be as accurate as Sobol' sequences.
They can be scrambled. The nets of Faure [19]_ are also widely
used. All dimensions are equally good, but the special sample
sizes grow rapidly with dimension :math:`d`. They can be scrambled.
The nets of Niederreiter and Xing [20]_ have the best asymptotic
properties but have not shown good empirical performance [21]_.
Higher order digital nets are formed by a digit interleaving process
in the digits of the constructed points. They can achieve higher
levels of asymptotic accuracy given higher smoothness conditions on :math:`f`
and they can be scrambled [22]_. There is little or no empirical work
showing the improved rate to be attained.
Using QMC is like using the entire period of a small random
number generator. The constructions are similar, and so are the
computational costs [23]_.
(R)QMC is sometimes improved by passing the points through
a baker's transformation (tent function) prior to using them.
That function has the form :math:`1-2|x-1/2|`. As :math:`x` goes from 0 to
1, this function goes from 0 to 1 and then back. It is very
useful to produce a periodic function for lattice rules [14]_,
and sometimes it improves the convergence rate [24]_.
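In code, the transformation is applied componentwise to the points, e.g.:
>>> import numpy as np
>>> from scipy.stats import qmc
>>> x = qmc.Halton(d=2, seed=123).random(16)
>>> x_baker = 1 - 2 * np.abs(x - 0.5)  # tent map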
It is not straightforward to apply QMC methods to Markov
chain Monte Carlo (MCMC). We can think of MCMC as using
:math:`n=1` point in :math:`[0,1]^{d}` for very large :math:`d`, with
ergodic results corresponding to :math:`d \to \infty`. One proposal is
in [25]_ and under strong conditions an improved rate of convergence
has been shown [26]_.
Returning to Sobol' points: there are many versions depending
on what are called direction numbers. Those are the result of
searches and are tabulated. A very widely used set of direction
numbers come from [27]_. It is extensible in dimension up to
:math:`d=21201`.
References
----------
.. [1] Owen, Art B. "Monte Carlo Book: the Quasi-Monte Carlo parts." 2019.
.. [2] Niederreiter, Harald. "Random number generation and quasi-Monte Carlo
methods." Society for Industrial and Applied Mathematics, 1992.
.. [3] Dick, Josef, Frances Y. Kuo, and Ian H. Sloan. "High-dimensional
integration: the quasi-Monte Carlo way." Acta Numerica no. 22: 133, 2013.
.. [4] Aho, A. V., C. Aistleitner, T. Anderson, K. Appel, V. Arnol'd, N.
   Aronszajn, D. Asotsky et al. In W. Chen et al. (eds.), "A Panorama of
   Discrepancy Theory", Springer International Publishing,
   Switzerland: 679, 2014.
.. [5] Hickernell, Fred J. "Koksma-Hlawka Inequality." Wiley StatsRef:
Statistics Reference Online, 2014.
.. [6] Owen, Art B. "On dropping the first Sobol' point." :arxiv:`2008.08051`,
2020.
.. [7] L'Ecuyer, Pierre, and Christiane Lemieux. "Recent advances in randomized
quasi-Monte Carlo methods." In Modeling uncertainty, pp. 419-474. Springer,
New York, NY, 2002.
.. [8] DiCiccio, Thomas J., and Bradley Efron. "Bootstrap confidence
intervals." Statistical science: 189-212, 1996.
.. [9] Dimov, Ivan T. "Monte Carlo methods for applied scientists." World
Scientific, 2008.
.. [10] Caflisch, Russel E., William J. Morokoff, and Art B. Owen. "Valuation
of mortgage backed securities using Brownian bridges to reduce effective
dimension." Journal of Computational Finance: no. 1 27-46, 1997.
.. [11] Sloan, Ian H., and Henryk Wozniakowski. "When are quasi-Monte Carlo
algorithms efficient for high dimensional integrals?." Journal of Complexity
14, no. 1 (1998): 1-33.
.. [12] Owen, Art B., and Daniel Rudolf, "A strong law of large numbers for
scrambled net integration." SIAM Review, to appear.
.. [13] Loh, Wei-Liem. "On the asymptotic distribution of scrambled net
quadrature." The Annals of Statistics 31, no. 4: 1282-1324, 2003.
.. [14] Sloan, Ian H. and S. Joe. "Lattice methods for multiple integration."
Oxford University Press, 1994.
.. [15] Dick, Josef, and Friedrich Pillichshammer. "Digital nets and sequences:
discrepancy theory and quasi-Monte Carlo integration." Cambridge University
Press, 2010.
.. [16] Dick, Josef, F. Kuo, Friedrich Pillichshammer, and I. Sloan.
"Construction algorithms for polynomial lattice rules for multivariate
integration." Mathematics of computation 74, no. 252: 1895-1921, 2005.
.. [17] Sobol', Il'ya Meerovich. "On the distribution of points in a cube and
the approximate evaluation of integrals." Zhurnal Vychislitel'noi Matematiki
i Matematicheskoi Fiziki 7, no. 4: 784-802, 1967.
.. [18] Halton, John H. "On the efficiency of certain quasi-random sequences of
points in evaluating multi-dimensional integrals." Numerische Mathematik 2,
no. 1: 84-90, 1960.
.. [19] Faure, Henri. "Discrepance de suites associees a un systeme de
numeration (en dimension s)." Acta arithmetica 41, no. 4: 337-351, 1982.
.. [20] Niederreiter, Harald, and Chaoping Xing. "Low-discrepancy sequences and
global function fields with many rational places." Finite Fields and their
applications 2, no. 3: 241-273, 1996.
.. [21] Hong, Hee Sun, and Fred J. Hickernell. "Algorithm 823: Implementing
scrambled digital sequences." ACM Transactions on Mathematical Software
(TOMS) 29, no. 2: 95-109, 2003.
.. [22] Dick, Josef. "Higher order scrambled digital nets achieve the optimal
rate of the root mean square error for smooth integrands." The Annals of
Statistics 39, no. 3: 1372-1398, 2011.
.. [23] Niederreiter, Harald. "Multidimensional numerical integration using
pseudorandom numbers." In Stochastic Programming 84 Part I, pp. 17-38.
Springer, Berlin, Heidelberg, 1986.
.. [24] Hickernell, Fred J. "Obtaining O(N^{-2+epsilon}) Convergence for Lattice
Quadrature Rules." In Monte Carlo and Quasi-Monte Carlo Methods 2000,
pp. 274-289. Springer, Berlin, Heidelberg, 2002.
.. [25] Owen, Art B., and Seth D. Tribble. "A quasi-Monte Carlo Metropolis
algorithm." Proceedings of the National Academy of Sciences 102,
no. 25: 8844-8849, 2005.
.. [26] Chen, Su. "Consistency and convergence rate of Markov chain quasi Monte
Carlo with examples." PhD diss., Stanford University, 2011.
.. [27] Joe, Stephen, and Frances Y. Kuo. "Constructing Sobol sequences with
better two-dimensional projections." SIAM Journal on Scientific Computing
30, no. 5: 2635-2654, 2008.
"""
from ._qmc import * # noqa: F403
from ._qmc import __all__ # noqa: F401

View File

@ -0,0 +1,73 @@
"""
======================================================
Random Number Generators (:mod:`scipy.stats.sampling`)
======================================================
.. currentmodule:: scipy.stats.sampling
This module contains a collection of random number generators to sample
from univariate continuous and discrete distributions. It uses the
implementation of a C library called "UNU.RAN". The only exception is
RatioUniforms, which is a pure Python implementation of the
Ratio-of-Uniforms method.
Generators Wrapped
==================
For continuous distributions
----------------------------
.. autosummary::
:toctree: generated/
NumericalInverseHermite
NumericalInversePolynomial
TransformedDensityRejection
SimpleRatioUniforms
RatioUniforms
For discrete distributions
--------------------------
.. autosummary::
:toctree: generated/
DiscreteAliasUrn
DiscreteGuideTable
Warnings / Errors used in :mod:`scipy.stats.sampling`
-----------------------------------------------------
.. autosummary::
:toctree: generated/
UNURANError
Generators for pre-defined distributions
========================================
To easily apply the above methods for some of the continuous distributions
in :mod:`scipy.stats`, the following functionality can be used:
.. autosummary::
:toctree: generated/
FastGeneratorInversion
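For example, sampling from a standard normal via numerical inversion of a
frozen `scipy.stats` distribution:
>>> from scipy import stats
>>> from scipy.stats.sampling import FastGeneratorInversion
>>> gen = FastGeneratorInversion(stats.norm())
>>> samples = gen.rvs(4)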
"""
from ._sampling import FastGeneratorInversion, RatioUniforms # noqa: F401
from ._unuran.unuran_wrapper import ( # noqa: F401
TransformedDensityRejection,
DiscreteAliasUrn,
DiscreteGuideTable,
NumericalInversePolynomial,
NumericalInverseHermite,
SimpleRatioUniforms,
UNURANError
)
__all__ = ["NumericalInverseHermite", "NumericalInversePolynomial",
"TransformedDensityRejection", "SimpleRatioUniforms",
"RatioUniforms", "DiscreteAliasUrn", "DiscreteGuideTable",
"UNURANError", "FastGeneratorInversion"]

View File

@ -0,0 +1,41 @@
# This file is not meant for public use and will be removed in SciPy v2.0.0.
# Use the `scipy.stats` namespace for importing the functions
# included below.
from scipy._lib.deprecation import _sub_module_deprecation
__all__ = [ # noqa: F822
'find_repeats', 'gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar',
'tmin', 'tmax', 'tstd', 'tsem', 'moment',
'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
'normaltest', 'jarque_bera',
'scoreatpercentile', 'percentileofscore',
'cumfreq', 'relfreq', 'obrientransform',
'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd',
'median_abs_deviation',
'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
'f_oneway',
'pearsonr', 'fisher_exact',
'spearmanr', 'pointbiserialr',
'kendalltau', 'weightedtau', 'multiscale_graphcorr',
'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
'kstest', 'ks_1samp', 'ks_2samp',
'chisquare', 'power_divergence',
'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
'rankdata',
'combine_pvalues', 'wasserstein_distance', 'energy_distance',
'brunnermunzel', 'alexandergovern', 'distributions',
'mstats_basic',
]
def __dir__():
return __all__
def __getattr__(name):
return _sub_module_deprecation(sub_package="stats", module="stats",
private_modules=["_stats_py", "_mgc"], all=__all__,
attribute=name)

View File

@ -0,0 +1,354 @@
import pickle
import numpy as np
import numpy.testing as npt
from numpy.testing import assert_allclose, assert_equal
from pytest import raises as assert_raises
import numpy.ma.testutils as ma_npt
from scipy._lib._util import (
getfullargspec_no_self as _getfullargspec, np_long
)
from scipy._lib._array_api import xp_assert_equal
from scipy import stats
def check_named_results(res, attributes, ma=False, xp=None):
for i, attr in enumerate(attributes):
if ma:
ma_npt.assert_equal(res[i], getattr(res, attr))
elif xp is not None:
xp_assert_equal(res[i], getattr(res, attr))
else:
npt.assert_equal(res[i], getattr(res, attr))
def check_normalization(distfn, args, distname):
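    # Verify the distribution integrates to 1: via the zeroth moment, via
    # expect() of the constant 1, and via the CDF at the upper support bound.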
norm_moment = distfn.moment(0, *args)
npt.assert_allclose(norm_moment, 1.0)
if distname == "rv_histogram_instance":
atol, rtol = 1e-5, 0
else:
atol, rtol = 1e-7, 1e-7
normalization_expect = distfn.expect(lambda x: 1, args=args)
npt.assert_allclose(normalization_expect, 1.0, atol=atol, rtol=rtol,
err_msg=distname, verbose=True)
_a, _b = distfn.support(*args)
normalization_cdf = distfn.cdf(_b, *args)
npt.assert_allclose(normalization_cdf, 1.0)
def check_moment(distfn, arg, m, v, msg):
m1 = distfn.moment(1, *arg)
m2 = distfn.moment(2, *arg)
if not np.isinf(m):
npt.assert_almost_equal(m1, m, decimal=10,
err_msg=msg + ' - 1st moment')
else: # or np.isnan(m1),
npt.assert_(np.isinf(m1),
msg + ' - 1st moment -infinite, m1=%s' % str(m1))
if not np.isinf(v):
npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10,
                                err_msg=msg + ' - 2nd moment')
else: # or np.isnan(m2),
npt.assert_(np.isinf(m2), msg + f' - 2nd moment -infinite, {m2=}')
def check_mean_expect(distfn, arg, m, msg):
if np.isfinite(m):
m1 = distfn.expect(lambda x: x, arg)
npt.assert_almost_equal(m1, m, decimal=5,
err_msg=msg + ' - 1st moment (expect)')
def check_var_expect(distfn, arg, m, v, msg):
    dist_looser_tolerances = {"rv_histogram_instance", "ksone"}
kwargs = {'rtol': 5e-6} if msg in dist_looser_tolerances else {}
if np.isfinite(v):
m2 = distfn.expect(lambda x: x*x, arg)
npt.assert_allclose(m2, v + m*m, **kwargs)
def check_skew_expect(distfn, arg, m, v, s, msg):
if np.isfinite(s):
m3e = distfn.expect(lambda x: np.power(x-m, 3), arg)
npt.assert_almost_equal(m3e, s * np.power(v, 1.5),
decimal=5, err_msg=msg + ' - skew')
else:
npt.assert_(np.isnan(s))
def check_kurt_expect(distfn, arg, m, v, k, msg):
if np.isfinite(k):
m4e = distfn.expect(lambda x: np.power(x-m, 4), arg)
npt.assert_allclose(m4e, (k + 3.) * np.power(v, 2),
atol=1e-5, rtol=1e-5,
err_msg=msg + ' - kurtosis')
elif not np.isposinf(k):
npt.assert_(np.isnan(k))
def check_munp_expect(dist, args, msg):
# If _munp is overridden, test a higher moment. (Before gh-18634, some
# distributions had issues with moments 5 and higher.)
if dist._munp.__func__ != stats.rv_continuous._munp:
res = dist.moment(5, *args) # shouldn't raise an error
ref = dist.expect(lambda x: x ** 5, args, lb=-np.inf, ub=np.inf)
if not np.isfinite(res): # could be valid; automated test can't know
return
# loose tolerance, mostly to see whether _munp returns *something*
assert_allclose(res, ref, atol=1e-10, rtol=1e-4,
err_msg=msg + ' - higher moment / _munp')
def check_entropy(distfn, arg, msg):
ent = distfn.entropy(*arg)
npt.assert_(not np.isnan(ent), msg + 'test Entropy is nan')
def check_private_entropy(distfn, args, superclass):
# compare a generic _entropy with the distribution-specific implementation
npt.assert_allclose(distfn._entropy(*args),
superclass._entropy(distfn, *args))
def check_entropy_vect_scale(distfn, arg):
# check 2-d
sc = np.asarray([[1, 2], [3, 4]])
v_ent = distfn.entropy(*arg, scale=sc)
s_ent = [distfn.entropy(*arg, scale=s) for s in sc.ravel()]
s_ent = np.asarray(s_ent).reshape(v_ent.shape)
assert_allclose(v_ent, s_ent, atol=1e-14)
# check invalid value, check cast
sc = [1, 2, -3]
v_ent = distfn.entropy(*arg, scale=sc)
s_ent = [distfn.entropy(*arg, scale=s) for s in sc]
s_ent = np.asarray(s_ent).reshape(v_ent.shape)
assert_allclose(v_ent, s_ent, atol=1e-14)
def check_edge_support(distfn, args):
# Make sure that x=self.a and self.b are handled correctly.
x = distfn.support(*args)
if isinstance(distfn, stats.rv_discrete):
x = x[0]-1, x[1]
npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])
if distfn.name not in ('skellam', 'dlaplace'):
# with a = -inf, log(0) generates warnings
npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])
npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])
# out-of-bounds for isf & ppf
npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all())
npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all())
def check_named_args(distfn, x, shape_args, defaults, meths):
## Check calling w/ named arguments.
# check consistency of shapes, numargs and _parse signature
signature = _getfullargspec(distfn._parse_args)
npt.assert_(signature.varargs is None)
npt.assert_(signature.varkw is None)
npt.assert_(not signature.kwonlyargs)
npt.assert_(list(signature.defaults) == list(defaults))
shape_argnames = signature.args[:-len(defaults)] # a, b, loc=0, scale=1
if distfn.shapes:
shapes_ = distfn.shapes.replace(',', ' ').split()
else:
shapes_ = ''
npt.assert_(len(shapes_) == distfn.numargs)
npt.assert_(len(shapes_) == len(shape_argnames))
# check calling w/ named arguments
shape_args = list(shape_args)
vals = [meth(x, *shape_args) for meth in meths]
npt.assert_(np.all(np.isfinite(vals)))
names, a, k = shape_argnames[:], shape_args[:], {}
while names:
k.update({names.pop(): a.pop()})
v = [meth(x, *a, **k) for meth in meths]
npt.assert_array_equal(vals, v)
if 'n' not in k.keys():
# `n` is first parameter of moment(), so can't be used as named arg
npt.assert_equal(distfn.moment(1, *a, **k),
distfn.moment(1, *shape_args))
# unknown arguments should not go through:
k.update({'kaboom': 42})
assert_raises(TypeError, distfn.cdf, x, **k)
def check_random_state_property(distfn, args):
# check the random_state attribute of a distribution *instance*
# This test fiddles with distfn.random_state. This breaks other tests,
# hence need to save it and then restore.
rndm = distfn.random_state
# baseline: this relies on the global state
np.random.seed(1234)
distfn.random_state = None
r0 = distfn.rvs(*args, size=8)
# use an explicit instance-level random_state
distfn.random_state = 1234
r1 = distfn.rvs(*args, size=8)
npt.assert_equal(r0, r1)
distfn.random_state = np.random.RandomState(1234)
r2 = distfn.rvs(*args, size=8)
npt.assert_equal(r0, r2)
# check that np.random.Generator can be used (numpy >= 1.17)
if hasattr(np.random, 'default_rng'):
# obtain a np.random.Generator object
rng = np.random.default_rng(1234)
distfn.rvs(*args, size=1, random_state=rng)
# can override the instance-level random_state for an individual .rvs call
distfn.random_state = 2
orig_state = distfn.random_state.get_state()
r3 = distfn.rvs(*args, size=8, random_state=np.random.RandomState(1234))
npt.assert_equal(r0, r3)
# ... and that does not alter the instance-level random_state!
npt.assert_equal(distfn.random_state.get_state(), orig_state)
# finally, restore the random_state
distfn.random_state = rndm
def check_meth_dtype(distfn, arg, meths):
q0 = [0.25, 0.5, 0.75]
x0 = distfn.ppf(q0, *arg)
x_cast = [x0.astype(tp) for tp in (np_long, np.float16, np.float32,
np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
for meth in meths:
val = meth(x, *arg)
npt.assert_(val.dtype == np.float64)
def check_ppf_dtype(distfn, arg):
q0 = np.asarray([0.25, 0.5, 0.75])
q_cast = [q0.astype(tp) for tp in (np.float16, np.float32, np.float64)]
for q in q_cast:
for meth in [distfn.ppf, distfn.isf]:
val = meth(q, *arg)
npt.assert_(val.dtype == np.float64)
def check_cmplx_deriv(distfn, arg):
# Distributions allow complex arguments.
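    # Complex-step differentiation: for analytic f, imag(f(x + i*h)) / h
    # approximates f'(x) without subtractive cancellation error.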
def deriv(f, x, *arg):
x = np.asarray(x)
h = 1e-10
return (f(x + h*1j, *arg)/h).imag
x0 = distfn.ppf([0.25, 0.51, 0.75], *arg)
x_cast = [x0.astype(tp) for tp in (np_long, np.float16, np.float32,
np.float64)]
for x in x_cast:
# casting may have clipped the values, exclude those
distfn._argcheck(*arg)
x = x[(distfn.a < x) & (x < distfn.b)]
pdf, cdf, sf = distfn.pdf(x, *arg), distfn.cdf(x, *arg), distfn.sf(x, *arg)
assert_allclose(deriv(distfn.cdf, x, *arg), pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logcdf, x, *arg), pdf/cdf, rtol=1e-5)
assert_allclose(deriv(distfn.sf, x, *arg), -pdf, rtol=1e-5)
assert_allclose(deriv(distfn.logsf, x, *arg), -pdf/sf, rtol=1e-5)
assert_allclose(deriv(distfn.logpdf, x, *arg),
deriv(distfn.pdf, x, *arg) / distfn.pdf(x, *arg),
rtol=1e-5)
def check_pickling(distfn, args):
# check that a distribution instance pickles and unpickles
# pay special attention to the random_state property
# save the random_state (restore later)
rndm = distfn.random_state
# check unfrozen
distfn.random_state = 1234
distfn.rvs(*args, size=8)
s = pickle.dumps(distfn)
r0 = distfn.rvs(*args, size=8)
unpickled = pickle.loads(s)
r1 = unpickled.rvs(*args, size=8)
npt.assert_equal(r0, r1)
# also smoke test some methods
medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)]
npt.assert_equal(medians[0], medians[1])
npt.assert_equal(distfn.cdf(medians[0], *args),
unpickled.cdf(medians[1], *args))
# check frozen pickling/unpickling with rvs
frozen_dist = distfn(*args)
pkl = pickle.dumps(frozen_dist)
unpickled = pickle.loads(pkl)
r0 = frozen_dist.rvs(size=8)
r1 = unpickled.rvs(size=8)
npt.assert_equal(r0, r1)
# check pickling/unpickling of .fit method
if hasattr(distfn, "fit"):
fit_function = distfn.fit
pickled_fit_function = pickle.dumps(fit_function)
unpickled_fit_function = pickle.loads(pickled_fit_function)
assert fit_function.__name__ == unpickled_fit_function.__name__ == "fit"
# restore the random_state
distfn.random_state = rndm
def check_freezing(distfn, args):
# regression test for gh-11089: freezing a distribution fails
# if loc and/or scale are specified
if isinstance(distfn, stats.rv_continuous):
locscale = {'loc': 1, 'scale': 2}
else:
locscale = {'loc': 1}
rv = distfn(*args, **locscale)
assert rv.a == distfn(*args).a
assert rv.b == distfn(*args).b
def check_rvs_broadcast(distfunc, distname, allargs, shape, shape_only, otype):
np.random.seed(123)
sample = distfunc.rvs(*allargs)
assert_equal(sample.shape, shape, "%s: rvs failed to broadcast" % distname)
if not shape_only:
rvs = np.vectorize(lambda *allargs: distfunc.rvs(*allargs), otypes=otype)
np.random.seed(123)
expected = rvs(*allargs)
assert_allclose(sample, expected, rtol=1e-13)

View File

@ -0,0 +1,171 @@
import math
import numpy as np
from scipy import special
from scipy.stats._qmc import primes_from_2_to
def _primes(n):
# Defined to facilitate comparison between translation and source
# In Matlab, primes(10.5) -> first four primes, primes(11.5) -> first five
return primes_from_2_to(math.ceil(n))
def _gaminv(a, b):
# Defined to facilitate comparison between translation and source
# Matlab's `gaminv` is like `special.gammaincinv` but args are reversed
return special.gammaincinv(b, a)
def _qsimvtv(m, nu, sigma, a, b, rng):
"""Estimates the multivariate t CDF using randomized QMC
Parameters
----------
m : int
The number of points
nu : float
Degrees of freedom
sigma : ndarray
A 2D positive semidefinite covariance matrix
a : ndarray
Lower integration limits
b : ndarray
Upper integration limits.
rng : Generator
Pseudorandom number generator
Returns
-------
p : float
The estimated CDF.
e : float
An absolute error estimate.
"""
# _qsimvtv is a Python translation of the Matlab function qsimvtv,
# semicolons and all.
#
# This function uses an algorithm given in the paper
# "Comparison of Methods for the Numerical Computation of
# Multivariate t Probabilities", in
# J. of Computational and Graphical Stat., 11(2002), pp. 950-971, by
# Alan Genz and Frank Bretz
#
# The primary references for the numerical integration are
# "On a Number-Theoretical Integration Method"
# H. Niederreiter, Aequationes Mathematicae, 8(1972), pp. 304-11.
# and
# "Randomization of Number Theoretic Methods for Multiple Integration"
# R. Cranley & T.N.L. Patterson, SIAM J Numer Anal, 13(1976), pp. 904-14.
#
# Alan Genz is the author of this function and following Matlab functions.
# Alan Genz, WSU Math, PO Box 643113, Pullman, WA 99164-3113
# Email : alangenz@wsu.edu
#
# Copyright (C) 2013, Alan Genz, All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided the following conditions are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
# 3. The contributor name(s) may not be used to endorse or promote
# products derived from this software without specific prior
# written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Initialization
sn = max(1, math.sqrt(nu)); ch, az, bz = _chlrps(sigma, a/sn, b/sn)
n = len(sigma); N = 10; P = math.ceil(m/N); on = np.ones(P); p = 0; e = 0
ps = np.sqrt(_primes(5*n*math.log(n+4)/4)); q = ps[:, np.newaxis] # Richtmyer gens.
# Randomization loop for ns samples
c = None; dc = None
for S in range(N):
vp = on.copy(); s = np.zeros((n, P))
for i in range(n):
x = np.abs(2*np.mod(q[i]*np.arange(1, P+1) + rng.random(), 1)-1) # periodizing transform
if i == 0:
r = on
if nu > 0:
r = np.sqrt(2*_gaminv(x, nu/2))
else:
y = _Phinv(c + x*dc)
s[i:] += ch[i:, i-1:i] * y
si = s[i, :]; c = on.copy(); ai = az[i]*r - si; d = on.copy(); bi = bz[i]*r - si
c[ai <= -9] = 0; tl = abs(ai) < 9; c[tl] = _Phi(ai[tl])
d[bi <= -9] = 0; tl = abs(bi) < 9; d[tl] = _Phi(bi[tl])
dc = d - c; vp = vp * dc
d = (np.mean(vp) - p)/(S + 1); p = p + d; e = (S - 1)*e/(S + 1) + d**2
e = math.sqrt(e) # error estimate is 3 times std error with N samples.
return p, e
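# Illustrative call (hypothetical values, matching the signature above):
#     rng = np.random.default_rng(1234)
#     sigma = np.array([[1.0, 0.5], [0.5, 1.0]])
#     lo = np.array([-np.inf, -np.inf]); hi = np.array([1.0, 1.0])
#     p, err = _qsimvtv(10_000, 5, sigma, lo, hi, rng)
# estimates P(T1 <= 1, T2 <= 1) for a bivariate t with 5 degrees of freedom,
# together with an error estimate.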
# Standard statistical normal distribution functions
def _Phi(z):
return special.ndtr(z)
def _Phinv(p):
return special.ndtri(p)
def _chlrps(R, a, b):
"""
Computes permuted and scaled lower Cholesky factor c for R which may be
singular, also permuting and scaling integration limit vectors a and b.
"""
ep = 1e-10 # singularity tolerance
eps = np.finfo(R.dtype).eps
n = len(R); c = R.copy(); ap = a.copy(); bp = b.copy(); d = np.sqrt(np.maximum(np.diag(c), 0))
for i in range(n):
if d[i] > 0:
c[:, i] /= d[i]; c[i, :] /= d[i]
ap[i] /= d[i]; bp[i] /= d[i]
y = np.zeros((n, 1)); sqtp = math.sqrt(2*math.pi)
for k in range(n):
im = k; ckk = 0; dem = 1; s = 0
for i in range(k, n):
if c[i, i] > eps:
cii = math.sqrt(max(c[i, i], 0))
if i > 0: s = c[i, :k] @ y[:k]
ai = (ap[i]-s)/cii; bi = (bp[i]-s)/cii; de = _Phi(bi)-_Phi(ai)
if de <= dem:
ckk = cii; dem = de; am = ai; bm = bi; im = i
if im > k:
ap[[im, k]] = ap[[k, im]]; bp[[im, k]] = bp[[k, im]]; c[im, im] = c[k, k]
t = c[im, :k].copy(); c[im, :k] = c[k, :k]; c[k, :k] = t
t = c[im+1:, im].copy(); c[im+1:, im] = c[im+1:, k]; c[im+1:, k] = t
t = c[k+1:im, k].copy(); c[k+1:im, k] = c[im, k+1:im].T; c[im, k+1:im] = t.T
if ckk > ep*(k+1):
c[k, k] = ckk; c[k, k+1:] = 0
for i in range(k+1, n):
c[i, k] = c[i, k]/ckk; c[i, k+1:i+1] = c[i, k+1:i+1] - c[i, k]*c[k+1:i+1, k].T
if abs(dem) > ep:
y[k] = (np.exp(-am**2/2) - np.exp(-bm**2/2)) / (sqtp*dem)
else:
y[k] = (am + bm) / 2
if am < -10:
y[k] = bm
elif bm > 10:
y[k] = am
c[k, :k+1] /= ckk; ap[k] /= ckk; bp[k] /= ckk
else:
c[k:, k] = 0; y[k] = (ap[k] + bp[k])/2
pass
return c, ap, bp

View File

@ -0,0 +1,607 @@
# DO NOT EDIT THIS FILE!
# This file was generated by the R script
# generate_fisher_exact_results_from_r.R
# The script was run with R version 3.6.2 (2019-12-12) at 2020-11-09 06:16:09
from collections import namedtuple
import numpy as np
Inf = np.inf
Parameters = namedtuple('Parameters',
['table', 'confidence_level', 'alternative'])
RResults = namedtuple('RResults',
['pvalue', 'conditional_odds_ratio',
'conditional_odds_ratio_ci'])
data = [
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.04035202926536294,
2.662846672960251))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.02301413756522116,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.004668988338943325,
0.895792956493601))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.1973244147157191,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.4153910882532168,
259.2593661129417))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.09580440012477633,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.08056337526385809,
1.22704788545557))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.2697004098849359,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.1176691231650079,
1.787463657995973))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.1973244147157192,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.003857141267422399,
2.407369893767229))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.06126482213438735,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
1.451643573543705))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.04761904761904762,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.024822256141754,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
39.00054996869288))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.04761904761904761,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.024822256141754,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
39.00054996869287))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=2.005657880389071e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(349.2595113327733,
3630.382605689872))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=5.728437460831947e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(152.4166024390096,
1425.700792178893))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.95,
alternative='two.sided'),
RResults(pvalue=0.574111858126088,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.8520462587912048,
1.340148950273938))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.02502345007115455,
6.304424772117853))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.02301413756522116,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.001923034001462487,
1.53670836950172))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.1973244147157191,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.2397970951413721,
1291.342011095509))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.09580440012477633,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.05127576113762925,
1.717176678806983))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.2697004098849359,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.07498546954483619,
2.506969905199901))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.1973244147157192,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.0007743881879531337,
4.170192301163831))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.06126482213438735,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
2.642491011905582))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.04761904761904762,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.496935393325443,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
198.019801980198))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.04761904761904761,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.496935393325443,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
198.019801980198))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=2.005657880389071e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(270.0334165523604,
5461.333333326708))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=5.728437460831947e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(116.7944750275836,
1931.995993191814))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.99,
alternative='two.sided'),
RResults(pvalue=0.574111858126088,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.7949398282935892,
1.436229679394333))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0,
1.797867027270803))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.0185217259520665,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0,
0.6785254803404526))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.9782608695652173,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0,
127.8497388102893))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.05625775074399956,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0,
1.032332939718425))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.1808979350599346,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0,
1.502407513296985))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.1652173913043479,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0,
1.820421051562392))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.0565217391304348,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
1.06224603077045))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.5,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
19.00192394479939))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.4999999999999999,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
19.00192394479939))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(0,
3045.460216525746))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(0,
1186.440170942579))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.95,
alternative='less'),
RResults(pvalue=0.7416227010368963,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0,
1.293551891610822))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.1300759363430016,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0,
4.375946050832565))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.0185217259520665,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0,
1.235282118191202))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.9782608695652173,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0,
657.2063583945989))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.05625775074399956,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0,
1.498867660683128))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.1808979350599346,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0,
2.186159386716762))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.1652173913043479,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0,
3.335351451901569))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.0565217391304348,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
2.075407697450433))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.5,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
99.00009507969122))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.4999999999999999,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
99.00009507969123))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(0,
4503.078257659934))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=1,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(0,
1811.766127544222))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.99,
alternative='less'),
RResults(pvalue=0.7416227010368963,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0,
1.396522811516685))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.979790445314723,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.05119649909830196,
Inf))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9990149169715733,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.007163749169069961,
Inf))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.1652173913043478,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.5493234651081089,
Inf))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9849086665340765,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.1003538933958604,
Inf))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9330176609214881,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.146507416280863,
Inf))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.9782608695652174,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.007821681994077808,
Inf))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.02380952380952382,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.487678929918272,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.0238095238095238,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(1.487678929918272,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=2.005657880388915e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(397.784359748113,
Inf))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=5.728437460831983e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(174.7148056880929,
Inf))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.95,
alternative='greater'),
RResults(pvalue=0.2959825901308897,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.8828406663967776,
Inf))),
(Parameters(table=[[100, 2], [1000, 5]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.979790445314723,
conditional_odds_ratio=0.25055839934223,
conditional_odds_ratio_ci=(0.03045407081240429,
Inf))),
(Parameters(table=[[2, 7], [8, 2]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9990149169715733,
conditional_odds_ratio=0.0858623513573622,
conditional_odds_ratio_ci=(0.002768053063547901,
Inf))),
(Parameters(table=[[5, 1], [10, 10]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.1652173913043478,
conditional_odds_ratio=4.725646047336587,
conditional_odds_ratio_ci=(0.2998184792279909,
Inf))),
(Parameters(table=[[5, 15], [20, 20]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9849086665340765,
conditional_odds_ratio=0.3394396617440851,
conditional_odds_ratio_ci=(0.06180414342643172,
Inf))),
(Parameters(table=[[5, 16], [16, 25]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9330176609214881,
conditional_odds_ratio=0.4937791394540491,
conditional_odds_ratio_ci=(0.09037094010066403,
Inf))),
(Parameters(table=[[10, 5], [10, 1]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.9782608695652174,
conditional_odds_ratio=0.2116112781158479,
conditional_odds_ratio_ci=(0.001521592095430679,
Inf))),
(Parameters(table=[[10, 5], [10, 0]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 0], [1, 4]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.02380952380952382,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.6661157890359722,
Inf))),
(Parameters(table=[[0, 5], [1, 4]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[5, 1], [0, 4]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.0238095238095238,
conditional_odds_ratio=Inf,
conditional_odds_ratio_ci=(0.6661157890359725,
Inf))),
(Parameters(table=[[0, 1], [3, 2]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=1,
conditional_odds_ratio=0,
conditional_odds_ratio_ci=(0,
Inf))),
(Parameters(table=[[200, 7], [8, 300]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=2.005657880388915e-122,
conditional_odds_ratio=977.7866978606228,
conditional_odds_ratio_ci=(297.9619252357688,
Inf))),
(Parameters(table=[[28, 21], [6, 1957]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=5.728437460831983e-44,
conditional_odds_ratio=425.2403028434684,
conditional_odds_ratio_ci=(130.3213490295859,
Inf))),
(Parameters(table=[[190, 800], [200, 900]],
confidence_level=0.99,
alternative='greater'),
RResults(pvalue=0.2959825901308897,
conditional_odds_ratio=1.068697577856801,
conditional_odds_ratio_ci=(0.8176272148267533,
Inf))),
]
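The list above is the tail of a table of reference values generated with R's fisher.test, used to check SciPy's Fisher exact test and conditional odds ratio against R. As a rough illustration (not part of the committed file), one of the entries above could be reproduced as follows; the sketch assumes a SciPy recent enough (roughly 1.11 or later) that scipy.stats.contingency.odds_ratio exists and its confidence_interval method accepts an alternative argument.

import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio

table = np.array([[200, 7], [8, 300]])   # entry with confidence_level=0.95, alternative='greater'

# One-sided Fisher exact p-value; R reports ~2.005657880388915e-122 for this table.
_, pvalue = fisher_exact(table, alternative='greater')

# Conditional maximum-likelihood odds ratio and one-sided CI, matching R's fisher.test.
res = odds_ratio(table, kind='conditional')
ci = res.confidence_interval(confidence_level=0.95, alternative='greater')

print(pvalue)            # expected close to 2.005657880388915e-122
print(res.statistic)     # expected close to 977.7866978606228
print(ci.low, ci.high)   # expected lower bound near 397.784359748113, upper bound inf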

View File

@ -0,0 +1,108 @@
NIST/ITL StRD
Dataset Name: AtmWtAg (AtmWtAg.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 108)
Procedure: Analysis of Variance
Reference: Powell, L.J., Murphy, T.J. and Gramlich, J.W. (1982).
"The Absolute Isotopic Abundance & Atomic Weight
of a Reference Sample of Silver".
NBS Journal of Research, 87, pp. 9-19.
Data: 1 Factor
2 Treatments
24 Replicates/Cell
48 Observations
7 Constant Leading Digits
Average Level of Difficulty
Observed Data
Model: 3 Parameters (mu, tau_1, tau_2)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Variation        df      Sums of Squares         Mean Squares            F Statistic
Between Instrument 1 3.63834187500000E-09 3.63834187500000E-09 1.59467335677930E+01
Within Instrument 46 1.04951729166667E-08 2.28155932971014E-10
Certified R-Squared 2.57426544538321E-01
Certified Residual
Standard Deviation 1.51048314446410E-05
Data: Instrument AgWt
1 107.8681568
1 107.8681465
1 107.8681572
1 107.8681785
1 107.8681446
1 107.8681903
1 107.8681526
1 107.8681494
1 107.8681616
1 107.8681587
1 107.8681519
1 107.8681486
1 107.8681419
1 107.8681569
1 107.8681508
1 107.8681672
1 107.8681385
1 107.8681518
1 107.8681662
1 107.8681424
1 107.8681360
1 107.8681333
1 107.8681610
1 107.8681477
2 107.8681079
2 107.8681344
2 107.8681513
2 107.8681197
2 107.8681604
2 107.8681385
2 107.8681642
2 107.8681365
2 107.8681151
2 107.8681082
2 107.8681517
2 107.8681448
2 107.8681198
2 107.8681482
2 107.8681334
2 107.8681609
2 107.8681101
2 107.8681512
2 107.8681469
2 107.8681360
2 107.8681254
2 107.8681261
2 107.8681450
2 107.8681368
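The certified AtmWtAg F statistic is a plain one-way ANOVA between the two instruments. A minimal sketch, assuming the two 24-value columns listed above have been loaded into group1 and group2 (only the first few values appear here as placeholders):

import numpy as np
from scipy.stats import f_oneway

group1 = np.array([107.8681568, 107.8681465, 107.8681572])  # ... all 24 instrument-1 values
group2 = np.array([107.8681079, 107.8681344, 107.8681513])  # ... all 24 instrument-2 values

F, p = f_oneway(group1, group2)
# With the full 48 observations, F should match the certified
# value 1.59467335677930E+01.
print(F, p)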

View File

@ -0,0 +1,85 @@
NIST/ITL StRD
Dataset Name: SiRstv (SiRstv.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 85)
Procedure: Analysis of Variance
Reference: Ehrstein, James and Croarkin, M. Carroll.
Unpublished NIST dataset.
Data: 1 Factor
5 Treatments
5 Replicates/Cell
25 Observations
3 Constant Leading Digits
Lower Level of Difficulty
Observed Data
Model: 6 Parameters (mu,tau_1, ... , tau_5)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Variation        df      Sums of Squares         Mean Squares            F Statistic
Between Instrument 4 5.11462616000000E-02 1.27865654000000E-02 1.18046237440255E+00
Within Instrument 20 2.16636560000000E-01 1.08318280000000E-02
Certified R-Squared 1.90999039051129E-01
Certified Residual
Standard Deviation 1.04076068334656E-01
Data: Instrument Resistance
1 196.3052
1 196.1240
1 196.1890
1 196.2569
1 196.3403
2 196.3042
2 196.3825
2 196.1669
2 196.3257
2 196.0422
3 196.1303
3 196.2005
3 196.2889
3 196.0343
3 196.1811
4 196.2795
4 196.1748
4 196.1494
4 196.1485
4 195.9885
5 196.2119
5 196.1051
5 196.1850
5 196.0052
5 196.2090
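f_oneway accepts any number of groups, so the five-instrument SiRstv case follows the same pattern. A quick check using the 25 resistance values listed above (a sketch, not part of the committed file):

from scipy.stats import f_oneway

groups = [
    [196.3052, 196.1240, 196.1890, 196.2569, 196.3403],   # instrument 1
    [196.3042, 196.3825, 196.1669, 196.3257, 196.0422],   # instrument 2
    [196.1303, 196.2005, 196.2889, 196.0343, 196.1811],   # instrument 3
    [196.2795, 196.1748, 196.1494, 196.1485, 195.9885],   # instrument 4
    [196.2119, 196.1051, 196.1850, 196.0052, 196.2090],   # instrument 5
]
F, p = f_oneway(*groups)
print(F)   # should agree with the certified F statistic 1.18046237440255E+00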

View File

@ -0,0 +1,249 @@
NIST/ITL StRD
Dataset Name: SmLs01 (SmLs01.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
1 Constant Leading Digit
Lower Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Variation        df      Sums of Squares         Mean Squares            F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1.4
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
1 1.3
1 1.5
2 1.3
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
2 1.2
2 1.4
3 1.5
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
3 1.4
3 1.6
4 1.3
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
4 1.2
4 1.4
5 1.5
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
5 1.4
5 1.6
6 1.3
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
6 1.2
6 1.4
7 1.5
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
7 1.4
7 1.6
8 1.3
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
8 1.2
8 1.4
9 1.5
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
9 1.4
9 1.6
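The certified SmLs01 summary values are internally consistent: each mean square is the corresponding sum of squares divided by its degrees of freedom, the F statistic is the ratio of the mean squares, R-squared is the between-treatment share of the total sum of squares, and the residual standard deviation is the square root of the within-treatment mean square. A quick arithmetic check:

ssb, ssw = 1.68, 1.80            # between/within treatment sums of squares
msb, msw = ssb / 8, ssw / 180    # mean squares (8 and 180 degrees of freedom)
print(msb / msw)                 # F statistic: 21.0
print(ssb / (ssb + ssw))         # R-squared: 0.482758620689655...
print(msw ** 0.5)                # residual standard deviation: 0.1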

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,249 @@
NIST/ITL StRD
Dataset Name: SmLs04 (SmLs04.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
7 Constant Leading Digits
Average Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Variation        df      Sums of Squares         Mean Squares            F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1000000.4
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
1 1000000.3
1 1000000.5
2 1000000.3
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
2 1000000.2
2 1000000.4
3 1000000.5
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
3 1000000.4
3 1000000.6
4 1000000.3
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
4 1000000.2
4 1000000.4
5 1000000.5
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
5 1000000.4
5 1000000.6
6 1000000.3
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
6 1000000.2
6 1000000.4
7 1000000.5
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
7 1000000.4
7 1000000.6
8 1000000.3
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
8 1000000.2
8 1000000.4
9 1000000.5
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6
9 1000000.4
9 1000000.6

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,249 @@
NIST/ITL StRD
Dataset Name: SmLs07 (SmLs07.dat)
File Format: ASCII
Certified Values (lines 41 to 47)
Data (lines 61 to 249)
Procedure: Analysis of Variance
Reference: Simon, Stephen D. and Lesage, James P. (1989).
"Assessing the Accuracy of ANOVA Calculations in
Statistical Software".
Computational Statistics & Data Analysis, 8, pp. 325-332.
Data: 1 Factor
9 Treatments
21 Replicates/Cell
189 Observations
13 Constant Leading Digits
Higher Level of Difficulty
Generated Data
Model: 10 Parameters (mu,tau_1, ... , tau_9)
y_{ij} = mu + tau_i + epsilon_{ij}
Certified Values:
Source of Variation        df      Sums of Squares         Mean Squares            F Statistic
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
Certified R-Squared 4.82758620689655E-01
Certified Residual
Standard Deviation 1.00000000000000E-01
Data: Treatment Response
1 1000000000000.4
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
1 1000000000000.3
1 1000000000000.5
2 1000000000000.3
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
2 1000000000000.2
2 1000000000000.4
3 1000000000000.5
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
3 1000000000000.4
3 1000000000000.6
4 1000000000000.3
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
4 1000000000000.2
4 1000000000000.4
5 1000000000000.5
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
5 1000000000000.4
5 1000000000000.6
6 1000000000000.3
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
6 1000000000000.2
6 1000000000000.4
7 1000000000000.5
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
7 1000000000000.4
7 1000000000000.6
8 1000000000000.3
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
8 1000000000000.2
8 1000000000000.4
9 1000000000000.5
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
9 1000000000000.4
9 1000000000000.6
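SmLs07 is rated a higher level of difficulty because each response carries 13 constant leading digits: the within-treatment spread of about 0.1 sits roughly thirteen orders of magnitude below the data level, so one-pass "sum of squares minus correction term" formulas cancel catastrophically in double precision. Subtracting the common offset leaves the ANOVA mathematically unchanged and is the standard workaround. A minimal sketch under that assumption (the arrays are illustrative placeholders for the full 21-value treatment columns above):

import numpy as np
from scipy.stats import f_oneway

g1 = np.array([1000000000000.4, 1000000000000.3, 1000000000000.5])  # ... treatment 1
g2 = np.array([1000000000000.3, 1000000000000.2, 1000000000000.4])  # ... treatment 2

shift = 1.0e12                                  # strip the 13 constant leading digits
F_shifted, _ = f_oneway(g1 - shift, g2 - shift)
F_raw, _ = f_oneway(g1, g2)
# With all nine treatment columns, the shifted computation reproduces the
# certified F = 21; comparing it with F_raw shows how much precision a given
# implementation loses to the constant offset.
print(F_raw, F_shifted)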

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,97 @@
NIST/ITL StRD
Dataset Name: Norris (Norris.dat)
File Format: ASCII
Certified Values (lines 31 to 46)
Data (lines 61 to 96)
Procedure: Linear Least Squares Regression
Reference: Norris, J., NIST.
Calibration of Ozone Monitors.
Data: 1 Response Variable (y)
1 Predictor Variable (x)
36 Observations
Lower Level of Difficulty
Observed Data
Model: Linear Class
2 Parameters (B0,B1)
y = B0 + B1*x + e
Certified Regression Statistics
Parameter        Estimate                Standard Deviation of Estimate
B0 -0.262323073774029 0.232818234301152
B1 1.00211681802045 0.429796848199937E-03
Residual
Standard Deviation 0.884796396144373
R-Squared 0.999993745883712
Certified Analysis of Variance Table
Source of Variation      Degrees of Freedom      Sums of Squares        Mean Squares          F Statistic
Regression 1 4255954.13232369 4255954.13232369 5436385.54079785
Residual 34 26.6173985294224 0.782864662630069
Data: y x
0.1 0.2
338.8 337.4
118.1 118.2
888.0 884.6
9.2 10.1
228.1 226.5
668.5 666.3
998.5 996.3
449.1 448.6
778.9 777.0
559.2 558.2
0.3 0.4
0.1 0.6
778.1 775.5
668.8 666.9
339.3 338.0
448.9 447.5
10.8 11.6
557.7 556.0
228.3 228.1
998.0 995.8
888.8 887.6
119.6 120.2
0.3 0.3
0.6 0.3
557.6 556.8
339.3 339.1
888.0 887.2
998.5 999.0
778.9 779.0
10.2 11.1
117.6 118.3
228.9 229.2
668.4 669.1
449.2 448.9
0.2 0.5
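The Norris certified values can be reproduced with an ordinary least-squares fit. A minimal sketch, assuming x and y hold the 36 predictor/response pairs listed above (note that the file lists the response y first); only the first rows appear here as placeholders:

import numpy as np
from scipy.stats import linregress

y = np.array([0.1, 338.8, 118.1])   # ... all 36 response values
x = np.array([0.2, 337.4, 118.2])   # ... all 36 predictor values

res = linregress(x, y)
# With the full data: intercept B0 ~ -0.262323073774029, slope B1 ~ 1.00211681802045,
# and R-squared = res.rvalue**2 ~ 0.999993745883712.
print(res.intercept, res.slope, res.rvalue**2)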

File diff suppressed because it is too large

View File

@ -0,0 +1,568 @@
import numpy as np
from numpy.testing import assert_allclose
import pytest
from pytest import raises as assert_raises
from scipy.stats import (binned_statistic, binned_statistic_2d,
binned_statistic_dd)
from scipy._lib._util import check_random_state
from .common_tests import check_named_results
class TestBinnedStatistic:
@classmethod
def setup_class(cls):
rng = check_random_state(9865)
cls.x = rng.uniform(size=100)
cls.y = rng.uniform(size=100)
cls.v = rng.uniform(size=100)
cls.X = rng.uniform(size=(100, 3))
cls.w = rng.uniform(size=100)
cls.u = rng.uniform(size=100) + 1e6
def test_1d_count(self):
x = self.x
v = self.v
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
count2, edges2 = np.histogram(x, bins=10)
assert_allclose(count1, count2)
assert_allclose(edges1, edges2)
def test_gh5927(self):
# smoke test for gh5927 - binned_statistic was using `is` for string
# comparison
x = self.x
v = self.v
statistics = ['mean', 'median', 'count', 'sum']
for statistic in statistics:
binned_statistic(x, v, statistic, bins=10)
def test_big_number_std(self):
# tests for numerical stability of std calculation
# see issue gh-10126 for more
x = self.x
u = self.u
stat1, edges1, bc = binned_statistic(x, u, 'std', bins=10)
stat2, edges2, bc = binned_statistic(x, u, np.std, bins=10)
assert_allclose(stat1, stat2)
def test_empty_bins_std(self):
# tests that std returns nan for empty bins
x = self.x
u = self.u
print(binned_statistic(x, u, 'count', bins=1000))
stat1, edges1, bc = binned_statistic(x, u, 'std', bins=1000)
stat2, edges2, bc = binned_statistic(x, u, np.std, bins=1000)
assert_allclose(stat1, stat2)
def test_non_finite_inputs_and_int_bins(self):
# a ValueError is raised if either `values` or `sample` contains np.inf or np.nan
# see issue gh-9010 for more
x = self.x
u = self.u
orig = u[0]
u[0] = np.inf
assert_raises(ValueError, binned_statistic, u, x, 'std', bins=10)
# need to test for non-python specific ints, e.g. np.int8, np.int64
assert_raises(ValueError, binned_statistic, u, x, 'std',
bins=np.int64(10))
u[0] = np.nan
assert_raises(ValueError, binned_statistic, u, x, 'count', bins=10)
# restore the original value; u belongs to the class and is shared across tests
u[0] = orig
def test_1d_result_attributes(self):
x = self.x
v = self.v
res = binned_statistic(x, v, 'count', bins=10)
attributes = ('statistic', 'bin_edges', 'binnumber')
check_named_results(res, attributes)
def test_1d_sum(self):
x = self.x
v = self.v
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
sum2, edges2 = np.histogram(x, bins=10, weights=v)
assert_allclose(sum1, sum2)
assert_allclose(edges1, edges2)
def test_1d_mean(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_std(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_min(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_max(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_median(self):
x = self.x
v = self.v
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_1d_bincode(self):
x = self.x[:20]
v = self.v[:20]
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
1, 2, 1])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_allclose(bc, bc2)
assert_allclose(bcount, count1)
def test_1d_range_keyword(self):
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
np.random.seed(9865)
x = np.arange(30)
data = np.random.random(30)
mean, bins, _ = binned_statistic(x[:15], data[:15])
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
assert_allclose(mean, mean_range)
assert_allclose(bins, bins_range)
assert_allclose(mean, mean_range2)
assert_allclose(bins, bins_range2)
def test_1d_multi_values(self):
x = self.x
v = self.v
w = self.w
stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)
assert_allclose(stat2[0], stat1v)
assert_allclose(stat2[1], stat1w)
assert_allclose(edges1v, edges2)
assert_allclose(bc1v, bc2)
def test_2d_count(self):
x = self.x
y = self.y
v = self.v
count1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'count', bins=5)
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
assert_allclose(count1, count2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_result_attributes(self):
x = self.x
y = self.y
v = self.v
res = binned_statistic_2d(x, y, v, 'count', bins=5)
attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
check_named_results(res, attributes)
def test_2d_sum(self):
x = self.x
y = self.y
v = self.v
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
assert_allclose(sum1, sum2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_mean(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_mean_unicode(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'mean', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_std(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_min(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_max(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_median(self):
x = self.x
y = self.y
v = self.v
stat1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'median', bins=5)
stat2, binx2, biny2, bc = binned_statistic_2d(
x, y, v, np.median, bins=5)
assert_allclose(stat1, stat2)
assert_allclose(binx1, binx2)
assert_allclose(biny1, biny2)
def test_2d_bincode(self):
x = self.x[:20]
y = self.y[:20]
v = self.v[:20]
count1, binx1, biny1, bc = binned_statistic_2d(
x, y, v, 'count', bins=3)
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
6, 11, 16, 6, 6, 11, 8])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_allclose(bc, bc2)
count1adj = count1[count1.nonzero()]
assert_allclose(bcount, count1adj)
def test_2d_multi_values(self):
x = self.x
y = self.y
v = self.v
w = self.w
stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
x, y, v, 'mean', bins=8)
stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
x, y, w, 'mean', bins=8)
stat2, binx2, biny2, bc2 = binned_statistic_2d(
x, y, [v, w], 'mean', bins=8)
assert_allclose(stat2[0], stat1v)
assert_allclose(stat2[1], stat1w)
assert_allclose(binx1v, binx2)
assert_allclose(biny1w, biny2)
assert_allclose(bc1v, bc2)
def test_2d_binnumbers_unraveled(self):
x = self.x
y = self.y
v = self.v
stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)
stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)
bcx3 = np.searchsorted(edgesx, x, side='right')
bcy3 = np.searchsorted(edgesy, y, side='right')
# `numpy.searchsorted` is non-inclusive on right-edge, compensate
bcx3[x == x.max()] -= 1
bcy3[y == y.max()] -= 1
assert_allclose(bcx, bc2[0])
assert_allclose(bcy, bc2[1])
assert_allclose(bcx3, bc2[0])
assert_allclose(bcy3, bc2[1])
def test_dd_count(self):
X = self.X
v = self.v
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
count2, edges2 = np.histogramdd(X, bins=3)
assert_allclose(count1, count2)
assert_allclose(edges1, edges2)
def test_dd_result_attributes(self):
X = self.X
v = self.v
res = binned_statistic_dd(X, v, 'count', bins=3)
attributes = ('statistic', 'bin_edges', 'binnumber')
check_named_results(res, attributes)
def test_dd_sum(self):
X = self.X
v = self.v
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
sum3, edges3, bc = binned_statistic_dd(X, v, np.sum, bins=3)
assert_allclose(sum1, sum2)
assert_allclose(edges1, edges2)
assert_allclose(sum1, sum3)
assert_allclose(edges1, edges3)
def test_dd_mean(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_std(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_min(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_max(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_median(self):
X = self.X
v = self.v
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
assert_allclose(stat1, stat2)
assert_allclose(edges1, edges2)
def test_dd_bincode(self):
X = self.X[:20]
v = self.v[:20]
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
32, 36, 91, 43, 87, 81, 81])
bcount = [(bc == i).sum() for i in np.unique(bc)]
assert_allclose(bc, bc2)
count1adj = count1[count1.nonzero()]
assert_allclose(bcount, count1adj)
def test_dd_multi_values(self):
X = self.X
v = self.v
w = self.w
for stat in ["count", "sum", "mean", "std", "min", "max", "median",
np.std]:
stat1v, edges1v, bc1v = binned_statistic_dd(X, v, stat, bins=8)
stat1w, edges1w, bc1w = binned_statistic_dd(X, w, stat, bins=8)
stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], stat, bins=8)
assert_allclose(stat2[0], stat1v)
assert_allclose(stat2[1], stat1w)
assert_allclose(edges1v, edges2)
assert_allclose(edges1w, edges2)
assert_allclose(bc1v, bc2)
def test_dd_binnumbers_unraveled(self):
X = self.X
v = self.v
stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)
stat2, edges2, bc2 = binned_statistic_dd(
X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)
assert_allclose(bcx, bc2[0])
assert_allclose(bcy, bc2[1])
assert_allclose(bcz, bc2[2])
def test_dd_binned_statistic_result(self):
# NOTE: tests the reuse of bin_edges from previous call
x = np.random.random((10000, 3))
v = np.random.random(10000)
bins = np.linspace(0, 1, 10)
bins = (bins, bins, bins)
result = binned_statistic_dd(x, v, 'mean', bins=bins)
stat = result.statistic
result = binned_statistic_dd(x, v, 'mean',
binned_statistic_result=result)
stat2 = result.statistic
assert_allclose(stat, stat2)
def test_dd_zero_dedges(self):
x = np.random.random((10000, 3))
v = np.random.random(10000)
bins = np.linspace(0, 1, 10)
bins = np.append(bins, 1)
bins = (bins, bins, bins)
with assert_raises(ValueError, match='difference is numerically 0'):
binned_statistic_dd(x, v, 'mean', bins=bins)
def test_dd_range_errors(self):
# Test that descriptive exceptions are raised as appropriate for bad
# values of the `range` argument. (See gh-12996)
with assert_raises(ValueError,
match='In range, start must be <= stop'):
binned_statistic_dd([self.y], self.v,
range=[[1, 0]])
with assert_raises(
ValueError,
match='In dimension 1 of range, start must be <= stop'):
binned_statistic_dd([self.x, self.y], self.v,
range=[[1, 0], [0, 1]])
with assert_raises(
ValueError,
match='In dimension 2 of range, start must be <= stop'):
binned_statistic_dd([self.x, self.y], self.v,
range=[[0, 1], [1, 0]])
with assert_raises(
ValueError,
match='range given for 1 dimensions; 2 required'):
binned_statistic_dd([self.x, self.y], self.v,
range=[[0, 1]])
def test_binned_statistic_float32(self):
X = np.array([0, 0.42358226], dtype=np.float32)
stat, _, _ = binned_statistic(X, None, 'count', bins=5)
assert_allclose(stat, np.array([1, 0, 0, 0, 1], dtype=np.float64))
def test_gh14332(self):
# Regression test for gh-14332: wrong output when `sample` values lie close to a bin edge
x = []
size = 20
for i in range(size):
x += [1-0.1**i]
bins = np.linspace(0,1,11)
sum1, edges1, bc = binned_statistic_dd(x, np.ones(len(x)),
bins=[bins], statistic='sum')
sum2, edges2 = np.histogram(x, bins=bins)
assert_allclose(sum1, sum2)
assert_allclose(edges1[0], edges2)
@pytest.mark.parametrize("dtype", [np.float64, np.complex128])
@pytest.mark.parametrize("statistic", [np.mean, np.median, np.sum, np.std,
np.min, np.max, 'count',
lambda x: (x**2).sum(),
lambda x: (x**2).sum() * 1j])
def test_dd_all(self, dtype, statistic):
def ref_statistic(x):
return len(x) if statistic == 'count' else statistic(x)
rng = np.random.default_rng(3704743126639371)
n = 10
x = rng.random(size=n)
i = x >= 0.5
v = rng.random(size=n)
if dtype is np.complex128:
v = v + rng.random(size=n)*1j
stat, _, _ = binned_statistic_dd(x, v, statistic, bins=2)
ref = np.array([ref_statistic(v[~i]), ref_statistic(v[i])])
assert_allclose(stat, ref)
assert stat.dtype == np.result_type(ref.dtype, np.float64)

View File

@ -0,0 +1,152 @@
# Tests for the CensoredData class.
import pytest
import numpy as np
from numpy.testing import assert_equal, assert_array_equal
from scipy.stats import CensoredData
class TestCensoredData:
def test_basic(self):
uncensored = [1]
left = [0]
right = [2, 5]
interval = [[2, 3]]
data = CensoredData(uncensored, left=left, right=right,
interval=interval)
assert_equal(data._uncensored, uncensored)
assert_equal(data._left, left)
assert_equal(data._right, right)
assert_equal(data._interval, interval)
udata = data._uncensor()
assert_equal(udata, np.concatenate((uncensored, left, right,
np.mean(interval, axis=1))))
def test_right_censored(self):
x = np.array([0, 3, 2.5])
is_censored = np.array([0, 1, 0], dtype=bool)
data = CensoredData.right_censored(x, is_censored)
assert_equal(data._uncensored, x[~is_censored])
assert_equal(data._right, x[is_censored])
assert_equal(data._left, [])
assert_equal(data._interval, np.empty((0, 2)))
def test_left_censored(self):
x = np.array([0, 3, 2.5])
is_censored = np.array([0, 1, 0], dtype=bool)
data = CensoredData.left_censored(x, is_censored)
assert_equal(data._uncensored, x[~is_censored])
assert_equal(data._left, x[is_censored])
assert_equal(data._right, [])
assert_equal(data._interval, np.empty((0, 2)))
def test_interval_censored_basic(self):
a = [0.5, 2.0, 3.0, 5.5]
b = [1.0, 2.5, 3.5, 7.0]
data = CensoredData.interval_censored(low=a, high=b)
assert_array_equal(data._interval, np.array(list(zip(a, b))))
assert data._uncensored.shape == (0,)
assert data._left.shape == (0,)
assert data._right.shape == (0,)
def test_interval_censored_mixed(self):
# This is actually a mix of uncensored, left-censored, right-censored
# and interval-censored data. Check that when the `interval_censored`
# class method is used, the data is correctly separated into the
# appropriate arrays.
a = [0.5, -np.inf, -13.0, 2.0, 1.0, 10.0, -1.0]
b = [0.5, 2500.0, np.inf, 3.0, 1.0, 11.0, np.inf]
data = CensoredData.interval_censored(low=a, high=b)
assert_array_equal(data._interval, [[2.0, 3.0], [10.0, 11.0]])
assert_array_equal(data._uncensored, [0.5, 1.0])
assert_array_equal(data._left, [2500.0])
assert_array_equal(data._right, [-13.0, -1.0])
def test_interval_to_other_types(self):
# The interval parameter can represent uncensored and
# left- or right-censored data. Test the conversion of such
# an example to the canonical form in which the different
# types have been split into the separate arrays.
interval = np.array([[0, 1], # interval-censored
[2, 2], # not censored
[3, 3], # not censored
[9, np.inf], # right-censored
[8, np.inf], # right-censored
[-np.inf, 0], # left-censored
[1, 2]]) # interval-censored
data = CensoredData(interval=interval)
assert_equal(data._uncensored, [2, 3])
assert_equal(data._left, [0])
assert_equal(data._right, [9, 8])
assert_equal(data._interval, [[0, 1], [1, 2]])
def test_empty_arrays(self):
data = CensoredData(uncensored=[], left=[], right=[], interval=[])
assert data._uncensored.shape == (0,)
assert data._left.shape == (0,)
assert data._right.shape == (0,)
assert data._interval.shape == (0, 2)
assert len(data) == 0
def test_invalid_constructor_args(self):
with pytest.raises(ValueError, match='must be a one-dimensional'):
CensoredData(uncensored=[[1, 2, 3]])
with pytest.raises(ValueError, match='must be a one-dimensional'):
CensoredData(left=[[1, 2, 3]])
with pytest.raises(ValueError, match='must be a one-dimensional'):
CensoredData(right=[[1, 2, 3]])
with pytest.raises(ValueError, match='must be a two-dimensional'):
CensoredData(interval=[[1, 2, 3]])
with pytest.raises(ValueError, match='must not contain nan'):
CensoredData(uncensored=[1, np.nan, 2])
with pytest.raises(ValueError, match='must not contain nan'):
CensoredData(left=[1, np.nan, 2])
with pytest.raises(ValueError, match='must not contain nan'):
CensoredData(right=[1, np.nan, 2])
with pytest.raises(ValueError, match='must not contain nan'):
CensoredData(interval=[[1, np.nan], [2, 3]])
with pytest.raises(ValueError,
match='both values must not be infinite'):
CensoredData(interval=[[1, 3], [2, 9], [np.inf, np.inf]])
with pytest.raises(ValueError,
match='left value must not exceed the right'):
CensoredData(interval=[[1, 0], [2, 2]])
@pytest.mark.parametrize('func', [CensoredData.left_censored,
CensoredData.right_censored])
def test_invalid_left_right_censored_args(self, func):
with pytest.raises(ValueError,
match='`x` must be one-dimensional'):
func([[1, 2, 3]], [0, 1, 1])
with pytest.raises(ValueError,
match='`censored` must be one-dimensional'):
func([1, 2, 3], [[0, 1, 1]])
with pytest.raises(ValueError, match='`x` must not contain'):
func([1, 2, np.nan], [0, 1, 1])
with pytest.raises(ValueError, match='must have the same length'):
func([1, 2, 3], [0, 0, 1, 1])
def test_invalid_censored_args(self):
with pytest.raises(ValueError,
match='`low` must be a one-dimensional'):
CensoredData.interval_censored(low=[[3]], high=[4, 5])
with pytest.raises(ValueError,
match='`high` must be a one-dimensional'):
CensoredData.interval_censored(low=[3], high=[[4, 5]])
with pytest.raises(ValueError, match='`low` must not contain'):
CensoredData.interval_censored([1, 2, np.nan], [0, 1, 1])
with pytest.raises(ValueError, match='must have the same length'):
CensoredData.interval_censored([1, 2, 3], [0, 0, 1, 1])
def test_count_censored(self):
x = [1, 2, 3]
# data1 has no censored data.
data1 = CensoredData(x)
assert data1.num_censored() == 0
data2 = CensoredData(uncensored=[2.5], left=[10], interval=[[0, 1]])
assert data2.num_censored() == 2

View File

@ -0,0 +1,241 @@
import numpy as np
from numpy.testing import (assert_equal, assert_array_equal,
assert_array_almost_equal, assert_approx_equal,
assert_allclose)
import pytest
from pytest import raises as assert_raises
from scipy.special import xlogy
from scipy.stats.contingency import (margins, expected_freq,
chi2_contingency, association)
def test_margins():
a = np.array([1])
m = margins(a)
assert_equal(len(m), 1)
m0 = m[0]
assert_array_equal(m0, np.array([1]))
a = np.array([[1]])
m0, m1 = margins(a)
expected0 = np.array([[1]])
expected1 = np.array([[1]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
a = np.arange(12).reshape(2, 6)
m0, m1 = margins(a)
expected0 = np.array([[15], [51]])
expected1 = np.array([[6, 8, 10, 12, 14, 16]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
a = np.arange(24).reshape(2, 3, 4)
m0, m1, m2 = margins(a)
expected0 = np.array([[[66]], [[210]]])
expected1 = np.array([[[60], [92], [124]]])
expected2 = np.array([[[60, 66, 72, 78]]])
assert_array_equal(m0, expected0)
assert_array_equal(m1, expected1)
assert_array_equal(m2, expected2)
def test_expected_freq():
assert_array_equal(expected_freq([1]), np.array([1.0]))
observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
e = expected_freq(observed)
assert_array_equal(e, np.ones_like(observed))
observed = np.array([[10, 10, 20], [20, 20, 20]])
e = expected_freq(observed)
correct = np.array([[12., 12., 16.], [18., 18., 24.]])
assert_array_almost_equal(e, correct)
def test_chi2_contingency_trivial():
# Some very simple tests for chi2_contingency.
# A trivial case
obs = np.array([[1, 2], [1, 2]])
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
assert_equal(chi2, 0.0)
assert_equal(p, 1.0)
assert_equal(dof, 1)
assert_array_equal(obs, expected)
# A *really* trivial case: 1-D data.
obs = np.array([1, 2, 3])
chi2, p, dof, expected = chi2_contingency(obs, correction=False)
assert_equal(chi2, 0.0)
assert_equal(p, 1.0)
assert_equal(dof, 0)
assert_array_equal(obs, expected)
def test_chi2_contingency_R():
# Some test cases that were computed independently, using R.
# Rcode = \
# """
# # Data vector.
# data <- c(
# 12, 34, 23, 4, 47, 11,
# 35, 31, 11, 34, 10, 18,
# 12, 32, 9, 18, 13, 19,
# 12, 12, 14, 9, 33, 25
# )
#
# # Create factor tags:r=rows, c=columns, t=tiers
# r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
# c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
# t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
#
# # 3-way Chi squared test of independence
# s = summary(xtabs(data~r+c+t))
# print(s)
# """
# Routput = \
# """
# Call: xtabs(formula = data ~ r + c + t)
# Number of cases in table: 478
# Number of factors: 3
# Test for independence of all factors:
# Chisq = 102.17, df = 17, p-value = 3.514e-14
# """
obs = np.array(
[[[12, 34, 23],
[35, 31, 11],
[12, 32, 9],
[12, 12, 14]],
[[4, 47, 11],
[34, 10, 18],
[18, 13, 19],
[9, 33, 25]]])
chi2, p, dof, expected = chi2_contingency(obs)
assert_approx_equal(chi2, 102.17, significant=5)
assert_approx_equal(p, 3.514e-14, significant=4)
assert_equal(dof, 17)
# Rcode = \
# """
# # Data vector.
# data <- c(
# #
# 12, 17,
# 11, 16,
# #
# 11, 12,
# 15, 16,
# #
# 23, 15,
# 30, 22,
# #
# 14, 17,
# 15, 16
# )
#
# # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
# r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
# c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
# d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
# t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
#
# # 4-way Chi squared test of independence
# s = summary(xtabs(data~r+c+d+t))
# print(s)
# """
# Routput = \
# """
# Call: xtabs(formula = data ~ r + c + d + t)
# Number of cases in table: 262
# Number of factors: 4
# Test for independence of all factors:
# Chisq = 8.758, df = 11, p-value = 0.6442
# """
obs = np.array(
[[[[12, 17],
[11, 16]],
[[11, 12],
[15, 16]]],
[[[23, 15],
[30, 22]],
[[14, 17],
[15, 16]]]])
chi2, p, dof, expected = chi2_contingency(obs)
assert_approx_equal(chi2, 8.758, significant=4)
assert_approx_equal(p, 0.6442, significant=4)
assert_equal(dof, 11)
def test_chi2_contingency_g():
c = np.array([[15, 60], [15, 90]])
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
correction=False)
assert_allclose(g, 2*xlogy(c, c/e).sum())
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
correction=True)
c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
c = np.array([[10, 12, 10], [12, 10, 10]])
g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
assert_allclose(g, 2*xlogy(c, c/e).sum())
def test_chi2_contingency_bad_args():
# Test that "bad" inputs raise a ValueError.
# Negative value in the array of observed frequencies.
obs = np.array([[-1, 10], [1, 2]])
assert_raises(ValueError, chi2_contingency, obs)
# The zeros in this will result in zeros in the array
# of expected frequencies.
obs = np.array([[0, 1], [0, 1]])
assert_raises(ValueError, chi2_contingency, obs)
# A degenerate case: `observed` has size 0.
obs = np.empty((0, 8))
assert_raises(ValueError, chi2_contingency, obs)
def test_chi2_contingency_yates_gh13875():
# Magnitude of Yates' continuity correction should not exceed difference
# between expected and observed value of the statistic; see gh-13875
observed = np.array([[1573, 3], [4, 0]])
p = chi2_contingency(observed)[1]
assert_allclose(p, 1, rtol=1e-12)
@pytest.mark.parametrize("correction", [False, True])
def test_result(correction):
obs = np.array([[1, 2], [1, 2]])
res = chi2_contingency(obs, correction=correction)
assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
def test_bad_association_args():
# Invalid Test Statistic
assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
# Invalid array shape
assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
# chi2_contingency exception
assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
# Invalid Array Item Data Type
assert_raises(ValueError, association,
np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')
@pytest.mark.parametrize('stat, expected',
[('cramer', 0.09222412010290792),
('tschuprow', 0.0775509319944633),
('pearson', 0.12932925727138758)])
def test_assoc(stat, expected):
# 2d Array
obs1 = np.array([[12, 13, 14, 15, 16],
[17, 16, 18, 19, 11],
[9, 15, 14, 12, 11]])
a = association(observed=obs1, method=stat)
assert_allclose(a, expected)

Some files were not shown because too many files have changed in this diff