asd
This commit is contained in:
649
venv/lib/python3.12/site-packages/scipy/stats/__init__.py
Normal file
649
venv/lib/python3.12/site-packages/scipy/stats/__init__.py
Normal file
@ -0,0 +1,649 @@
|
||||
"""
|
||||
.. _statsrefmanual:
|
||||
|
||||
==========================================
|
||||
Statistical functions (:mod:`scipy.stats`)
|
||||
==========================================
|
||||
|
||||
.. currentmodule:: scipy.stats
|
||||
|
||||
This module contains a large number of probability distributions,
|
||||
summary and frequency statistics, correlation functions and statistical
|
||||
tests, masked statistics, kernel density estimation, quasi-Monte Carlo
|
||||
functionality, and more.
|
||||
|
||||
Statistics is a very large area, and there are topics that are out of scope
|
||||
for SciPy and are covered by other packages. Some of the most important ones
|
||||
are:
|
||||
|
||||
- `statsmodels <https://www.statsmodels.org/stable/index.html>`__:
|
||||
regression, linear models, time series analysis, extensions to topics
|
||||
also covered by ``scipy.stats``.
|
||||
- `Pandas <https://pandas.pydata.org/>`__: tabular data, time series
|
||||
functionality, interfaces to other statistical languages.
|
||||
- `PyMC <https://docs.pymc.io/>`__: Bayesian statistical
|
||||
modeling, probabilistic machine learning.
|
||||
- `scikit-learn <https://scikit-learn.org/>`__: classification, regression,
|
||||
model selection.
|
||||
- `Seaborn <https://seaborn.pydata.org/>`__: statistical data visualization.
|
||||
- `rpy2 <https://rpy2.github.io/>`__: Python to R bridge.
|
||||
|
||||
|
||||
Probability distributions
|
||||
=========================
|
||||
|
||||
Each univariate distribution is an instance of a subclass of `rv_continuous`
|
||||
(`rv_discrete` for discrete distributions):
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rv_continuous
|
||||
rv_discrete
|
||||
rv_histogram
|
||||
|
||||
Continuous distributions
|
||||
------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
alpha -- Alpha
|
||||
anglit -- Anglit
|
||||
arcsine -- Arcsine
|
||||
argus -- Argus
|
||||
beta -- Beta
|
||||
betaprime -- Beta Prime
|
||||
bradford -- Bradford
|
||||
burr -- Burr (Type III)
|
||||
burr12 -- Burr (Type XII)
|
||||
cauchy -- Cauchy
|
||||
chi -- Chi
|
||||
chi2 -- Chi-squared
|
||||
cosine -- Cosine
|
||||
crystalball -- Crystalball
|
||||
dgamma -- Double Gamma
|
||||
dweibull -- Double Weibull
|
||||
erlang -- Erlang
|
||||
expon -- Exponential
|
||||
exponnorm -- Exponentially Modified Normal
|
||||
exponweib -- Exponentiated Weibull
|
||||
exponpow -- Exponential Power
|
||||
f                 -- F (Snedecor F)
|
||||
fatiguelife -- Fatigue Life (Birnbaum-Saunders)
|
||||
fisk -- Fisk
|
||||
foldcauchy -- Folded Cauchy
|
||||
foldnorm -- Folded Normal
|
||||
genlogistic -- Generalized Logistic
|
||||
gennorm -- Generalized normal
|
||||
genpareto -- Generalized Pareto
|
||||
genexpon -- Generalized Exponential
|
||||
genextreme -- Generalized Extreme Value
|
||||
gausshyper -- Gauss Hypergeometric
|
||||
gamma -- Gamma
|
||||
gengamma -- Generalized gamma
|
||||
genhalflogistic -- Generalized Half Logistic
|
||||
genhyperbolic -- Generalized Hyperbolic
|
||||
geninvgauss -- Generalized Inverse Gaussian
|
||||
gibrat -- Gibrat
|
||||
gompertz -- Gompertz (Truncated Gumbel)
|
||||
gumbel_r -- Right Sided Gumbel, Log-Weibull, Fisher-Tippett, Extreme Value Type I
|
||||
gumbel_l -- Left Sided Gumbel, etc.
|
||||
halfcauchy -- Half Cauchy
|
||||
halflogistic -- Half Logistic
|
||||
halfnorm -- Half Normal
|
||||
halfgennorm -- Generalized Half Normal
|
||||
hypsecant -- Hyperbolic Secant
|
||||
invgamma -- Inverse Gamma
|
||||
invgauss -- Inverse Gaussian
|
||||
invweibull -- Inverse Weibull
|
||||
irwinhall -- Irwin-Hall
|
||||
jf_skew_t -- Jones and Faddy Skew-T
|
||||
johnsonsb -- Johnson SB
|
||||
johnsonsu -- Johnson SU
|
||||
kappa4 -- Kappa 4 parameter
|
||||
kappa3 -- Kappa 3 parameter
|
||||
ksone -- Distribution of Kolmogorov-Smirnov one-sided test statistic
|
||||
kstwo -- Distribution of Kolmogorov-Smirnov two-sided test statistic
|
||||
kstwobign -- Limiting Distribution of scaled Kolmogorov-Smirnov two-sided test statistic.
|
||||
laplace -- Laplace
|
||||
laplace_asymmetric -- Asymmetric Laplace
|
||||
levy -- Levy
|
||||
levy_l
|
||||
levy_stable
|
||||
logistic -- Logistic
|
||||
loggamma -- Log-Gamma
|
||||
loglaplace -- Log-Laplace (Log Double Exponential)
|
||||
lognorm -- Log-Normal
|
||||
loguniform -- Log-Uniform
|
||||
lomax -- Lomax (Pareto of the second kind)
|
||||
maxwell -- Maxwell
|
||||
mielke -- Mielke's Beta-Kappa
|
||||
moyal -- Moyal
|
||||
nakagami -- Nakagami
|
||||
ncx2 -- Non-central chi-squared
|
||||
ncf -- Non-central F
|
||||
nct -- Non-central Student's T
|
||||
norm -- Normal (Gaussian)
|
||||
norminvgauss -- Normal Inverse Gaussian
|
||||
pareto -- Pareto
|
||||
pearson3 -- Pearson type III
|
||||
powerlaw -- Power-function
|
||||
powerlognorm -- Power log normal
|
||||
powernorm -- Power normal
|
||||
rdist -- R-distribution
|
||||
rayleigh -- Rayleigh
|
||||
rel_breitwigner -- Relativistic Breit-Wigner
|
||||
rice -- Rice
|
||||
recipinvgauss -- Reciprocal Inverse Gaussian
|
||||
semicircular -- Semicircular
|
||||
skewcauchy -- Skew Cauchy
|
||||
skewnorm -- Skew normal
|
||||
studentized_range -- Studentized Range
|
||||
t -- Student's T
|
||||
trapezoid -- Trapezoidal
|
||||
triang -- Triangular
|
||||
truncexpon -- Truncated Exponential
|
||||
truncnorm -- Truncated Normal
|
||||
truncpareto -- Truncated Pareto
|
||||
truncweibull_min -- Truncated minimum Weibull distribution
|
||||
tukeylambda -- Tukey-Lambda
|
||||
uniform -- Uniform
|
||||
vonmises -- Von-Mises (Circular)
|
||||
vonmises_line -- Von-Mises (Line)
|
||||
wald -- Wald
|
||||
weibull_min -- Minimum Weibull (see Frechet)
|
||||
weibull_max -- Maximum Weibull (see Frechet)
|
||||
wrapcauchy -- Wrapped Cauchy
|
||||
|
||||
The ``fit`` method of the univariate continuous distributions uses
|
||||
maximum likelihood estimation to fit the distribution to a data set.
|
||||
The ``fit`` method can accept regular data or *censored data*.
|
||||
Censored data is represented with instances of the `CensoredData`
|
||||
class.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
CensoredData
|
||||
|
||||
|
||||
Multivariate distributions
|
||||
--------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
multivariate_normal -- Multivariate normal distribution
|
||||
matrix_normal -- Matrix normal distribution
|
||||
dirichlet -- Dirichlet
|
||||
dirichlet_multinomial -- Dirichlet multinomial distribution
|
||||
wishart -- Wishart
|
||||
invwishart -- Inverse Wishart
|
||||
multinomial -- Multinomial distribution
|
||||
special_ortho_group -- SO(N) group
|
||||
ortho_group -- O(N) group
|
||||
unitary_group -- U(N) group
|
||||
random_correlation -- random correlation matrices
|
||||
multivariate_t -- Multivariate t-distribution
|
||||
multivariate_hypergeom -- Multivariate hypergeometric distribution
|
||||
random_table -- Distribution of random tables with given marginals
|
||||
uniform_direction -- Uniform distribution on S(N-1)
|
||||
vonmises_fisher -- Von Mises-Fisher distribution
|
||||
|
||||
`scipy.stats.multivariate_normal` methods accept instances
|
||||
of the following class to represent the covariance.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
Covariance -- Representation of a covariance matrix
|
||||
|
||||
|
||||
Discrete distributions
|
||||
----------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
bernoulli -- Bernoulli
|
||||
betabinom -- Beta-Binomial
|
||||
betanbinom -- Beta-Negative Binomial
|
||||
binom -- Binomial
|
||||
boltzmann -- Boltzmann (Truncated Discrete Exponential)
|
||||
dlaplace -- Discrete Laplacian
|
||||
geom -- Geometric
|
||||
hypergeom -- Hypergeometric
|
||||
logser -- Logarithmic (Log-Series, Series)
|
||||
nbinom -- Negative Binomial
|
||||
nchypergeom_fisher -- Fisher's Noncentral Hypergeometric
|
||||
nchypergeom_wallenius -- Wallenius's Noncentral Hypergeometric
|
||||
nhypergeom -- Negative Hypergeometric
|
||||
planck -- Planck (Discrete Exponential)
|
||||
poisson -- Poisson
|
||||
randint -- Discrete Uniform
|
||||
skellam -- Skellam
|
||||
yulesimon -- Yule-Simon
|
||||
zipf -- Zipf (Zeta)
|
||||
zipfian -- Zipfian
|
||||
|
||||
|
||||
An overview of statistical functions is given below. Many of these functions
|
||||
have a similar version in `scipy.stats.mstats` which work for masked arrays.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe -- Descriptive statistics
|
||||
gmean -- Geometric mean
|
||||
hmean -- Harmonic mean
|
||||
pmean -- Power mean
|
||||
kurtosis -- Fisher or Pearson kurtosis
|
||||
mode -- Modal value
|
||||
moment -- Central moment
|
||||
expectile -- Expectile
|
||||
skew -- Skewness
|
||||
kstat --
|
||||
kstatvar --
|
||||
tmean -- Truncated arithmetic mean
|
||||
tvar -- Truncated variance
|
||||
tmin --
|
||||
tmax --
|
||||
tstd --
|
||||
tsem --
|
||||
variation -- Coefficient of variation
|
||||
find_repeats
|
||||
rankdata
|
||||
tiecorrect
|
||||
trim_mean
|
||||
gstd -- Geometric Standard Deviation
|
||||
iqr
|
||||
sem
|
||||
bayes_mvs
|
||||
mvsdist
|
||||
entropy
|
||||
differential_entropy
|
||||
median_abs_deviation
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
cumfreq
|
||||
percentileofscore
|
||||
scoreatpercentile
|
||||
relfreq
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
binned_statistic -- Compute a binned statistic for a set of data.
|
||||
binned_statistic_2d -- Compute a 2-D binned statistic for a set of data.
|
||||
binned_statistic_dd -- Compute a d-D binned statistic for a set of data.
|
||||
|
||||
.. _hypotests:
|
||||
|
||||
Hypothesis Tests and related functions
|
||||
======================================
|
||||
SciPy has many functions for performing hypothesis tests that return a
|
||||
test statistic and a p-value, and several of them return confidence intervals
|
||||
and/or other related information.
|
||||
|
||||
The headings below are based on common uses of the functions within, but due to
|
||||
the wide variety of statistical procedures, any attempt at coarse-grained
|
||||
categorization will be imperfect. Also, note that tests within the same heading
|
||||
are not interchangeable in general (e.g. many have different distributional
|
||||
assumptions).
|
||||
|
||||
One Sample Tests / Paired Sample Tests
|
||||
--------------------------------------
|
||||
One sample tests are typically used to assess whether a single sample was
|
||||
drawn from a specified distribution or a distribution with specified properties
|
||||
(e.g. zero mean).
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
binomtest
|
||||
quantile_test
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
jarque_bera
|
||||
shapiro
|
||||
anderson
|
||||
cramervonmises
|
||||
ks_1samp
|
||||
goodness_of_fit
|
||||
chisquare
|
||||
power_divergence
|
||||
|
||||
Paired sample tests are often used to assess whether two samples were drawn
|
||||
from the same distribution; they differ from the independent sample tests below
|
||||
in that each observation in one sample is treated as paired with a
|
||||
closely-related observation in the other sample (e.g. when environmental
|
||||
factors are controlled between observations within a pair but not among pairs).
|
||||
They can also be interpreted or used as one-sample tests (e.g. tests on the
|
||||
mean or median of *differences* between paired observations).
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_rel
|
||||
wilcoxon
|
||||
|
||||
Association/Correlation Tests
|
||||
-----------------------------
|
||||
|
||||
These tests are often used to assess whether there is a relationship (e.g.
|
||||
linear) between paired observations in multiple samples or among the
|
||||
coordinates of multivariate observations.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
linregress
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
weightedtau
|
||||
somersd
|
||||
siegelslopes
|
||||
theilslopes
|
||||
page_trend_test
|
||||
multiscale_graphcorr
|
||||
|
||||
These association tests are designed to work with samples in the form of contingency
|
||||
tables. Supporting functions are available in `scipy.stats.contingency`.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
fisher_exact
|
||||
barnard_exact
|
||||
boschloo_exact
|
||||
|
||||
Independent Sample Tests
|
||||
------------------------
|
||||
Independent sample tests are typically used to assess whether multiple samples
|
||||
were independently drawn from the same distribution or different distributions
|
||||
with a shared property (e.g. equal means).
|
||||
|
||||
Some tests are specifically for comparing two samples.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_ind_from_stats
|
||||
poisson_means_test
|
||||
ttest_ind
|
||||
mannwhitneyu
|
||||
bws_test
|
||||
ranksums
|
||||
brunnermunzel
|
||||
mood
|
||||
ansari
|
||||
cramervonmises_2samp
|
||||
epps_singleton_2samp
|
||||
ks_2samp
|
||||
kstest
|
||||
|
||||
Others are generalized to multiple samples.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
tukey_hsd
|
||||
dunnett
|
||||
kruskal
|
||||
alexandergovern
|
||||
fligner
|
||||
levene
|
||||
bartlett
|
||||
median_test
|
||||
friedmanchisquare
|
||||
anderson_ksamp
|
||||
|
||||
Resampling and Monte Carlo Methods
|
||||
----------------------------------
|
||||
The following functions can reproduce the p-value and confidence interval
|
||||
results of most of the functions above, and often produce accurate results in a
|
||||
wider variety of conditions. They can also be used to perform hypothesis tests
|
||||
and generate confidence intervals for custom statistics. This flexibility comes
|
||||
at the cost of greater computational requirements and stochastic results.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
monte_carlo_test
|
||||
permutation_test
|
||||
bootstrap
|
||||
power
|
||||
|
||||
Instances of the following object can be passed into some hypothesis test
|
||||
functions to perform a resampling or Monte Carlo version of the hypothesis
|
||||
test.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
MonteCarloMethod
|
||||
PermutationMethod
|
||||
BootstrapMethod
|
||||
|
||||
Multiple Hypothesis Testing and Meta-Analysis
|
||||
---------------------------------------------
|
||||
These functions are for assessing the results of individual tests as a whole.
|
||||
Functions for performing specific multiple hypothesis tests (e.g. post hoc
|
||||
tests) are listed above.
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
combine_pvalues
|
||||
false_discovery_control
|
||||
|
||||
|
||||
The following functions are related to the tests above but do not belong in the
|
||||
above categories.
|
||||
|
||||
Quasi-Monte Carlo
|
||||
=================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.qmc
|
||||
|
||||
Contingency Tables
|
||||
==================
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.contingency
|
||||
|
||||
Masked statistics functions
|
||||
===========================
|
||||
|
||||
.. toctree::
|
||||
|
||||
stats.mstats
|
||||
|
||||
|
||||
Other statistical functionality
|
||||
===============================
|
||||
|
||||
Transformations
|
||||
---------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
boxcox
|
||||
boxcox_normmax
|
||||
boxcox_llf
|
||||
yeojohnson
|
||||
yeojohnson_normmax
|
||||
yeojohnson_llf
|
||||
obrientransform
|
||||
sigmaclip
|
||||
trimboth
|
||||
trim1
|
||||
zmap
|
||||
zscore
|
||||
gzscore
|
||||
|
||||
Statistical distances
|
||||
---------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
wasserstein_distance
|
||||
wasserstein_distance_nd
|
||||
energy_distance
|
||||
|
||||
Sampling
|
||||
--------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
stats.sampling
|
||||
|
||||
Random variate generation / CDF Inversion
|
||||
-----------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
rvs_ratio_uniforms
|
||||
|
||||
Fitting / Survival Analysis
|
||||
---------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
fit
|
||||
ecdf
|
||||
logrank
|
||||
|
||||
Directional statistical functions
|
||||
---------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
directional_stats
|
||||
circmean
|
||||
circvar
|
||||
circstd
|
||||
|
||||
Sensitivity Analysis
|
||||
--------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
sobol_indices
|
||||
|
||||
Plot-tests
|
||||
----------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ppcc_max
|
||||
ppcc_plot
|
||||
probplot
|
||||
boxcox_normplot
|
||||
yeojohnson_normplot
|
||||
|
||||
Univariate and multivariate kernel density estimation
|
||||
-----------------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
gaussian_kde
|
||||
|
||||
Warnings / Errors used in :mod:`scipy.stats`
|
||||
--------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
DegenerateDataWarning
|
||||
ConstantInputWarning
|
||||
NearConstantInputWarning
|
||||
FitError
|
||||
|
||||
Result classes used in :mod:`scipy.stats`
|
||||
-----------------------------------------
|
||||
|
||||
.. warning::
|
||||
|
||||
These classes are private, but they are included here because instances
|
||||
of them are returned by other statistical functions. User import and
|
||||
instantiation is not supported.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
stats._result_classes
|
||||
|
||||
""" # noqa: E501
|
||||
|
||||
from ._warnings_errors import (ConstantInputWarning, NearConstantInputWarning,
|
||||
DegenerateDataWarning, FitError)
|
||||
from ._stats_py import *
|
||||
from ._variation import variation
|
||||
from .distributions import *
|
||||
from ._morestats import *
|
||||
from ._multicomp import *
|
||||
from ._binomtest import binomtest
|
||||
from ._binned_statistic import *
|
||||
from ._kde import gaussian_kde
|
||||
from . import mstats
|
||||
from . import qmc
|
||||
from ._multivariate import *
|
||||
from . import contingency
|
||||
from .contingency import chi2_contingency
|
||||
from ._censored_data import CensoredData
|
||||
from ._resampling import (bootstrap, monte_carlo_test, permutation_test, power,
|
||||
MonteCarloMethod, PermutationMethod, BootstrapMethod)
|
||||
from ._entropy import *
|
||||
from ._hypotests import *
|
||||
from ._rvs_sampling import rvs_ratio_uniforms
|
||||
from ._page_trend_test import page_trend_test
|
||||
from ._mannwhitneyu import mannwhitneyu
|
||||
from ._bws_test import bws_test
|
||||
from ._fit import fit, goodness_of_fit
|
||||
from ._covariance import Covariance
|
||||
from ._sensitivity_analysis import *
|
||||
from ._survival import *
|
||||
from ._mgc import multiscale_graphcorr
|
||||
|
||||
|
||||
# Deprecated namespaces, to be removed in v2.0.0
|
||||
from . import (
|
||||
biasedurn, kde, morestats, mstats_basic, mstats_extras, mvn, stats
|
||||
)
|
||||
|
||||
|
||||
__all__ = [s for s in dir() if not s.startswith("_")] # Remove dunders.
|
||||
|
||||
from scipy._lib._testutils import PytestTester
|
||||
test = PytestTester(__name__)
|
||||
del PytestTester
|
||||
Binary file not shown.
@ -0,0 +1,686 @@
|
||||
# Many scipy.stats functions support `axis` and `nan_policy` parameters.
|
||||
# When the two are combined, it can be tricky to get all the behavior just
|
||||
# right. This file contains utility functions useful for scipy.stats functions
|
||||
# that support `axis` and `nan_policy`, including a decorator that
|
||||
# automatically adds `axis` and `nan_policy` arguments to a function.
|
||||
|
||||
import warnings
|
||||
import numpy as np
|
||||
from functools import wraps
|
||||
from scipy._lib._docscrape import FunctionDoc, Parameter
|
||||
from scipy._lib._util import _contains_nan, AxisError, _get_nan
|
||||
from scipy._lib._array_api import array_namespace, is_numpy
|
||||
|
||||
import inspect
|
||||
|
||||
# Canned messages for `SmallSampleWarning`. The `1d`/`nd` variants cover a
# single sample vs. many axis-slices; the `omit` variants are used when NaNs
# were removed (nan_policy='omit') before the sample-size check failed.
# NOTE: these are runtime strings matched by downstream tests — do not edit
# the wording casually.

too_small_1d_not_omit = (
    "One or more sample arguments is too small; all "
    "returned values will be NaN. "
    "See documentation for sample size requirements.")

too_small_1d_omit = (
    "After omitting NaNs, one or more sample arguments "
    "is too small; all returned values will be NaN. "
    "See documentation for sample size requirements.")

too_small_nd_not_omit = (
    "All axis-slices of one or more sample arguments are "
    "too small; all elements of returned arrays will be NaN. "
    "See documentation for sample size requirements.")

too_small_nd_omit = (
    "After omitting NaNs, one or more axis-slices of one "
    "or more sample arguments is too small; corresponding "
    "elements of returned arrays will be NaN. "
    "See documentation for sample size requirements.")
|
||||
|
||||
class SmallSampleWarning(RuntimeWarning):
    """Warns that a sample is too small for a meaningful result.

    Emitted with one of the ``too_small_*`` messages defined in this
    module; the corresponding output entries are set to NaN.
    """
    pass
|
||||
|
||||
|
||||
def _broadcast_arrays(arrays, axis=None, xp=None):
    """
    Broadcast shapes of arrays, ignoring incompatibility of specified axes
    """
    # An empty sequence of arrays broadcasts to itself.
    if not arrays:
        return arrays
    if xp is None:
        xp = array_namespace(*arrays)
    arrays = [xp.asarray(arr) for arr in arrays]
    target_shapes = _broadcast_shapes([arr.shape for arr in arrays], axis)
    # With axis=None, `_broadcast_shapes` returns one common shape;
    # replicate it so every array has its own target shape below.
    if axis is None:
        target_shapes = [target_shapes] * len(arrays)
    return [xp.broadcast_to(arr, shape)
            for arr, shape in zip(arrays, target_shapes)]
|
||||
|
||||
|
||||
def _broadcast_shapes(shapes, axis=None):
|
||||
"""
|
||||
Broadcast shapes, ignoring incompatibility of specified axes
|
||||
"""
|
||||
if not shapes:
|
||||
return shapes
|
||||
|
||||
# input validation
|
||||
if axis is not None:
|
||||
axis = np.atleast_1d(axis)
|
||||
axis_int = axis.astype(int)
|
||||
if not np.array_equal(axis_int, axis):
|
||||
raise AxisError('`axis` must be an integer, a '
|
||||
'tuple of integers, or `None`.')
|
||||
axis = axis_int
|
||||
|
||||
# First, ensure all shapes have same number of dimensions by prepending 1s.
|
||||
n_dims = max([len(shape) for shape in shapes])
|
||||
new_shapes = np.ones((len(shapes), n_dims), dtype=int)
|
||||
for row, shape in zip(new_shapes, shapes):
|
||||
row[len(row)-len(shape):] = shape # can't use negative indices (-0:)
|
||||
|
||||
# Remove the shape elements of the axes to be ignored, but remember them.
|
||||
if axis is not None:
|
||||
axis[axis < 0] = n_dims + axis[axis < 0]
|
||||
axis = np.sort(axis)
|
||||
if axis[-1] >= n_dims or axis[0] < 0:
|
||||
message = (f"`axis` is out of bounds "
|
||||
f"for array of dimension {n_dims}")
|
||||
raise AxisError(message)
|
||||
|
||||
if len(np.unique(axis)) != len(axis):
|
||||
raise AxisError("`axis` must contain only distinct elements")
|
||||
|
||||
removed_shapes = new_shapes[:, axis]
|
||||
new_shapes = np.delete(new_shapes, axis, axis=1)
|
||||
|
||||
# If arrays are broadcastable, shape elements that are 1 may be replaced
|
||||
# with a corresponding non-1 shape element. Assuming arrays are
|
||||
# broadcastable, that final shape element can be found with:
|
||||
new_shape = np.max(new_shapes, axis=0)
|
||||
# except in case of an empty array:
|
||||
new_shape *= new_shapes.all(axis=0)
|
||||
|
||||
# Among all arrays, there can only be one unique non-1 shape element.
|
||||
# Therefore, if any non-1 shape element does not match what we found
|
||||
# above, the arrays must not be broadcastable after all.
|
||||
if np.any(~((new_shapes == 1) | (new_shapes == new_shape))):
|
||||
raise ValueError("Array shapes are incompatible for broadcasting.")
|
||||
|
||||
if axis is not None:
|
||||
# Add back the shape elements that were ignored
|
||||
new_axis = axis - np.arange(len(axis))
|
||||
new_shapes = [tuple(np.insert(new_shape, new_axis, removed_shape))
|
||||
for removed_shape in removed_shapes]
|
||||
return new_shapes
|
||||
else:
|
||||
return tuple(new_shape)
|
||||
|
||||
|
||||
def _broadcast_array_shapes_remove_axis(arrays, axis=None):
    """
    Broadcast shapes of arrays, dropping specified axes

    Given a sequence of arrays `arrays` and an integer or tuple `axis`, find
    the shape of the broadcast result after consuming/dropping `axis`.
    In other words, return output shape of a typical hypothesis test on
    `arrays` vectorized along `axis`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats._axis_nan_policy import _broadcast_array_shapes_remove_axis
    >>> a = np.zeros((5, 2, 1))
    >>> b = np.zeros((9, 3))
    >>> _broadcast_array_shapes_remove_axis((a, b), 1)
    (5, 3)
    """
    # Here `axis=None` means "consume/drop no axes"; it does NOT mean
    # the arrays are raveled before broadcasting.
    return _broadcast_shapes_remove_axis([arr.shape for arr in arrays], axis)
|
||||
|
||||
|
||||
def _broadcast_shapes_remove_axis(shapes, axis=None):
    """
    Broadcast shapes, dropping specified axes

    Same as _broadcast_array_shapes_remove_axis, but given a sequence
    of array shapes `shapes` instead of the arrays themselves.
    """
    broadcast = _broadcast_shapes(shapes, axis)
    # With ignored axes, every entry agrees except along `axis`,
    # which is removed anyway — so the first entry suffices.
    result = broadcast[0]
    if axis is not None:
        result = np.delete(result, axis)
    return tuple(result)
|
||||
|
||||
|
||||
def _broadcast_concatenate(arrays, axis, paired=False):
    """Concatenate arrays along an axis with broadcasting."""
    # Paired samples must match along every axis, so no axis is exempted
    # from broadcasting; otherwise the concatenation axis may differ.
    ignore = None if paired else axis
    broadcasted = _broadcast_arrays(arrays, ignore)
    return np.concatenate(broadcasted, axis=axis)
|
||||
|
||||
|
||||
# TODO: add support for `axis` tuples
|
||||
def _remove_nans(samples, paired):
|
||||
"Remove nans from paired or unpaired 1D samples"
|
||||
# potential optimization: don't copy arrays that don't contain nans
|
||||
if not paired:
|
||||
return [sample[~np.isnan(sample)] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
nans = np.isnan(samples[0])
|
||||
for sample in samples[1:]:
|
||||
nans = nans | np.isnan(sample)
|
||||
not_nans = ~nans
|
||||
return [sample[not_nans] for sample in samples]
|
||||
|
||||
|
||||
def _remove_sentinel(samples, paired, sentinel):
|
||||
"Remove sentinel values from paired or unpaired 1D samples"
|
||||
# could consolidate with `_remove_nans`, but it's not quite as simple as
|
||||
# passing `sentinel=np.nan` because `(np.nan == np.nan) is False`
|
||||
|
||||
# potential optimization: don't copy arrays that don't contain sentinel
|
||||
if not paired:
|
||||
return [sample[sample != sentinel] for sample in samples]
|
||||
|
||||
# for paired samples, we need to remove the whole pair when any part
|
||||
# has a nan
|
||||
sentinels = (samples[0] == sentinel)
|
||||
for sample in samples[1:]:
|
||||
sentinels = sentinels | (sample == sentinel)
|
||||
not_sentinels = ~sentinels
|
||||
return [sample[not_sentinels] for sample in samples]
|
||||
|
||||
|
||||
def _masked_arrays_2_sentinel_arrays(samples):
    """Convert masked arrays in `samples` to plain arrays with a sentinel.

    Returns ``(out_samples, sentinel)``, where masked elements have been
    replaced by `sentinel` — a value chosen so it appears nowhere else in
    the data. If no input has a masked element, the inputs are returned
    unchanged with ``sentinel=None``.

    Raises ``ValueError`` if every representable value of the common dtype
    already occurs in the data, leaving no candidate sentinel.
    """
    # masked arrays in `samples` are converted to regular arrays, and values
    # corresponding with masked elements are replaced with a sentinel value

    # return without modifying arrays if none have a mask
    has_mask = False
    for sample in samples:
        # plain ndarrays have no `mask` attribute; treat as unmasked
        mask = getattr(sample, 'mask', False)
        has_mask = has_mask or np.any(mask)
    if not has_mask:
        return samples, None  # None means there is no sentinel value

    # Choose a sentinel value. We can't use `np.nan`, because sentinel (masked)
    # values are always omitted, but there are different nan policies.
    dtype = np.result_type(*samples)
    # non-numeric dtypes fall back to float64 so finfo/iinfo below work
    dtype = dtype if np.issubdtype(dtype, np.number) else np.float64
    for i in range(len(samples)):
        # Things get more complicated if the arrays are of different types.
        # We could have different sentinel values for each array, but
        # the purpose of this code is convenience, not efficiency.
        samples[i] = samples[i].astype(dtype, copy=False)

    inexact = np.issubdtype(dtype, np.inexact)
    info = np.finfo if inexact else np.iinfo
    max_possible, min_possible = info(dtype).max, info(dtype).min
    # "step down one value": nextafter for floats, minus one for ints
    nextafter = np.nextafter if inexact else (lambda x, _: x - 1)

    sentinel = max_possible
    # For simplicity, min_possible/np.infs are not candidate sentinel values
    while sentinel > min_possible:
        for sample in samples:
            if np.any(sample == sentinel):  # choose a new sentinel value
                sentinel = nextafter(sentinel, -np.inf)
                break
        else:  # when sentinel value is OK, break the while loop
            break
    else:
        # while-loop exhausted every candidate without finding a free value
        message = ("This function replaces masked elements with sentinel "
                   "values, but the data contains all distinct values of this "
                   "data type. Consider promoting the dtype to `np.float64`.")
        raise ValueError(message)

    # replace masked elements with sentinel value
    out_samples = []
    for sample in samples:
        mask = getattr(sample, 'mask', None)
        if mask is not None:  # turn all masked arrays into sentinel arrays
            # mask may be a scalar False or smaller-shaped; expand to match
            mask = np.broadcast_to(mask, sample.shape)
            # copy only when we will actually write sentinel values
            sample = sample.data.copy() if np.any(mask) else sample.data
            sample = np.asarray(sample)  # `sample.data` could be a memoryview?
            sample[mask] = sentinel
        out_samples.append(sample)

    return out_samples, sentinel
|
||||
|
||||
|
||||
def _check_empty_inputs(samples, axis):
    """
    Check for empty sample; return appropriate output for a vectorized hypotest
    """
    # When every sample has data, signal the caller to run the real test.
    if all(sample.size != 0 for sample in samples):
        return None
    # Otherwise the statistic and p-value are NaN-filled (or empty) arrays
    # of the broadcast output shape with `axis` consumed.
    output_shape = _broadcast_array_shapes_remove_axis(samples, axis)
    return np.ones(output_shape) * _get_nan(*samples)
|
||||
|
||||
|
||||
def _add_reduced_axes(res, reduced_axes, keepdims):
|
||||
"""
|
||||
Add reduced axes back to all the arrays in the result object
|
||||
if keepdims = True.
|
||||
"""
|
||||
return ([np.expand_dims(output, reduced_axes)
|
||||
if not isinstance(output, int) else output for output in res]
|
||||
if keepdims else res)
|
||||
|
||||
|
||||
# Standard docstring / signature entries for `axis`, `nan_policy`, `keepdims`
|
||||
_name = 'axis'
|
||||
_desc = (
|
||||
"""If an int, the axis of the input along which to compute the statistic.
|
||||
The statistic of each axis-slice (e.g. row) of the input will appear in a
|
||||
corresponding element of the output.
|
||||
If ``None``, the input will be raveled before computing the statistic."""
|
||||
.split('\n'))
|
||||
|
||||
|
||||
def _get_axis_params(default_axis=0, _name=_name, _desc=_desc):  # bind NOW
    """Build the numpydoc and `inspect` parameter objects for `axis`.

    The module-level ``_name``/``_desc`` are captured as default argument
    values so that later reassignment of those globals (for the other
    standard parameters) cannot affect this function.
    """
    type_line = f"int or None, default: {default_axis}"
    axis_doc = Parameter(_name, type_line, _desc)
    axis_sig = inspect.Parameter(_name,
                                 inspect.Parameter.KEYWORD_ONLY,
                                 default=default_axis)
    return axis_doc, axis_sig
||||
|
||||
|
||||
# Standard docstring / signature entry for `nan_policy`.
_name = 'nan_policy'
_type = "{'propagate', 'omit', 'raise'}"
_desc = (
    """Defines how to handle input NaNs.

- ``propagate``: if a NaN is present in the axis slice (e.g. row) along
  which the statistic is computed, the corresponding entry of the output
  will be NaN.
- ``omit``: NaNs will be omitted when performing the calculation.
  If insufficient data remains in the axis slice along which the
  statistic is computed, the corresponding entry of the output will be
  NaN.
- ``raise``: if a NaN is present, a ``ValueError`` will be raised."""
    .split('\n'))
_nan_policy_parameter_doc = Parameter(_name, _type, _desc)
_nan_policy_parameter = inspect.Parameter(_name,
                                          inspect.Parameter.KEYWORD_ONLY,
                                          default='propagate')

# Standard docstring / signature entry for `keepdims`.
_name = 'keepdims'
_type = "bool, default: False"
_desc = (
    """If this is set to True, the axes which are reduced are left
in the result as dimensions with size one. With this option,
the result will broadcast correctly against the input array."""
    .split('\n'))
_keepdims_parameter_doc = Parameter(_name, _type, _desc)
_keepdims_parameter = inspect.Parameter(_name,
                                        inspect.Parameter.KEYWORD_ONLY,
                                        default=False)

# Note appended to the Notes section of every decorated function's docstring.
_standard_note_addition = (
    """\nBeginning in SciPy 1.9, ``np.matrix`` inputs (not recommended for new
code) are converted to ``np.ndarray`` before the calculation is performed. In
this case, the output will be a scalar or ``np.ndarray`` of appropriate shape
rather than a 2D ``np.matrix``. Similarly, while masked elements of masked
arrays are ignored, the output will be a scalar or ``np.ndarray`` rather than a
masked array with ``mask=False``.""").split('\n')
||||
|
||||
|
||||
def _axis_nan_policy_factory(tuple_to_result, default_axis=0,
                             n_samples=1, paired=False,
                             result_to_tuple=None, too_small=0,
                             n_outputs=2, kwd_samples=[], override=None):
    """Factory for a wrapper that adds axis/nan_policy params to a function.

    Parameters
    ----------
    tuple_to_result : callable
        Callable that returns an object of the type returned by the function
        being wrapped (e.g. the namedtuple or dataclass returned by a
        statistical test) provided the separate components (e.g. statistic,
        pvalue).
    default_axis : int, default: 0
        The default value of the axis argument. Standard is 0 except when
        backwards compatibility demands otherwise (e.g. `None`).
    n_samples : int or callable, default: 1
        The number of data samples accepted by the function
        (e.g. `mannwhitneyu`), a callable that accepts a dictionary of
        parameters passed into the function and returns the number of data
        samples (e.g. `wilcoxon`), or `None` to indicate an arbitrary number
        of samples (e.g. `kruskal`).
    paired : {False, True}
        Whether the function being wrapped treats the samples as paired (i.e.
        corresponding elements of each sample should be considered as different
        components of the same sample.)
    result_to_tuple : callable, optional
        Function that unpacks the results of the function being wrapped into
        a tuple. This is essentially the inverse of `tuple_to_result`. Default
        is `None`, which is appropriate for statistical tests that return a
        statistic, pvalue tuple (rather than, e.g., a non-iterable dataclass).
    too_small : int or callable, default: 0
        The largest unacceptably small sample for the function being wrapped.
        For example, some functions require samples of size two or more or they
        raise an error. This argument prevents the error from being raised when
        input is not 1D and instead places a NaN in the corresponding element
        of the result. If callable, it must accept a list of samples, axis,
        and a dictionary of keyword arguments passed to the wrapper function as
        arguments and return a bool indicating whether the samples passed are
        too small.
    n_outputs : int or callable, default: 2
        The number of outputs produced by the function given 1d sample(s). For
        example, hypothesis tests that return a namedtuple or result object
        with attributes ``statistic`` and ``pvalue`` use the default
        ``n_outputs=2``; summary statistics with scalar output use
        ``n_outputs=1``. Alternatively, may be a callable that accepts a
        dictionary of arguments passed into the wrapped function and returns
        the number of outputs corresponding with those arguments.
    kwd_samples : sequence, default: []
        The names of keyword parameters that should be treated as samples. For
        example, `gmean` accepts as its first argument a sample `a` but
        also `weights` as a fourth, optional keyword argument. In this case, we
        use `n_samples=1` and kwd_samples=['weights'].
    override : dict, default: {'vectorization': False, 'nan_propagation': True}
        Pass a dictionary with ``'vectorization': True`` to ensure that the
        decorator overrides the function's behavior for multidimensional input.
        Use ``'nan_propagation': False`` to ensure that the decorator does not
        override the function's behavior for ``nan_policy='propagate'``.
    """
    # Specify which existing behaviors the decorator must override
    temp = override or {}
    override = {'vectorization': False,
                'nan_propagation': True}
    override.update(temp)

    if result_to_tuple is None:
        # default: the wrapped function already returns an iterable result
        def result_to_tuple(res):
            return res

    if not callable(too_small):
        # integer threshold -> any sample with <= `too_small` elements
        # along `axis` is considered too small
        def is_too_small(samples, *ts_args, axis=-1, **ts_kwargs):
            for sample in samples:
                if sample.shape[axis] <= too_small:
                    return True
            return False
    else:
        is_too_small = too_small

    def axis_nan_policy_decorator(hypotest_fun_in):
        @wraps(hypotest_fun_in)
        def axis_nan_policy_wrapper(*args, _no_deco=False, **kwds):

            if _no_deco:  # for testing, decorator does nothing
                return hypotest_fun_in(*args, **kwds)

            # For now, skip the decorator entirely if using array API. In the
            # future, we'll probably want to use it for `keepdims`, `axis`
            # tuples, etc.
            if len(args) == 0:  # extract sample from `kwds` if there are no `args`
                used_kwd_samples = list(set(kwds).intersection(set(kwd_samples)))
                temp = used_kwd_samples[:1]
            else:
                temp = args[0]

            if not is_numpy(array_namespace(temp)):
                msg = ("Use of `nan_policy` and `keepdims` "
                       "is incompatible with non-NumPy arrays.")
                if 'nan_policy' in kwds or 'keepdims' in kwds:
                    raise NotImplementedError(msg)
                return hypotest_fun_in(*args, **kwds)

            # We need to be flexible about whether position or keyword
            # arguments are used, but we need to make sure users don't pass
            # both for the same parameter. To complicate matters, some
            # functions accept samples with *args, and some functions already
            # accept `axis` and `nan_policy` as positional arguments.
            # The strategy is to make sure that there is no duplication
            # between `args` and `kwds`, combine the two into `kwds`, then
            # the samples, `nan_policy`, and `axis` from `kwds`, as they are
            # dealt with separately.

            # Check for intersection between positional and keyword args
            params = list(inspect.signature(hypotest_fun_in).parameters)
            if n_samples is None:
                # Give unique names to each positional sample argument
                # Note that *args can't be provided as a keyword argument
                params = [f"arg{i}" for i in range(len(args))] + params[1:]

            # raise if there are too many positional args
            maxarg = (np.inf if inspect.getfullargspec(hypotest_fun_in).varargs
                      else len(inspect.getfullargspec(hypotest_fun_in).args))
            if len(args) > maxarg:  # let the function raise the right error
                hypotest_fun_in(*args, **kwds)

            # raise if multiple values passed for same parameter
            d_args = dict(zip(params, args))
            intersection = set(d_args) & set(kwds)
            if intersection:  # let the function raise the right error
                hypotest_fun_in(*args, **kwds)

            # Consolidate other positional and keyword args into `kwds`
            kwds.update(d_args)

            # rename avoids UnboundLocalError
            if callable(n_samples):
                # Future refactoring idea: no need for callable n_samples.
                # Just replace `n_samples` and `kwd_samples` with a single
                # list of the names of all samples, and treat all of them
                # as `kwd_samples` are treated below.
                n_samp = n_samples(kwds)
            else:
                n_samp = n_samples or len(args)

            # get the number of outputs
            n_out = n_outputs  # rename to avoid UnboundLocalError
            if callable(n_out):
                n_out = n_out(kwds)

            # If necessary, rearrange function signature: accept other samples
            # as positional args right after the first n_samp args
            kwd_samp = [name for name in kwd_samples
                        if kwds.get(name, None) is not None]
            n_kwd_samp = len(kwd_samp)
            if not kwd_samp:
                hypotest_fun_out = hypotest_fun_in
            else:
                def hypotest_fun_out(*samples, **kwds):
                    new_kwds = dict(zip(kwd_samp, samples[n_samp:]))
                    kwds.update(new_kwds)
                    return hypotest_fun_in(*samples[:n_samp], **kwds)

            # Extract the things we need here
            try:  # if something is missing
                samples = [np.atleast_1d(kwds.pop(param))
                           for param in (params[:n_samp] + kwd_samp)]
            except KeyError:  # let the function raise the right error
                # might need to revisit this if required arg is not a "sample"
                hypotest_fun_in(*args, **kwds)
            vectorized = True if 'axis' in params else False
            vectorized = vectorized and not override['vectorization']
            axis = kwds.pop('axis', default_axis)
            nan_policy = kwds.pop('nan_policy', 'propagate')
            keepdims = kwds.pop("keepdims", False)
            del args  # avoid the possibility of passing both `args` and `kwds`

            # convert masked arrays to regular arrays with sentinel values
            samples, sentinel = _masked_arrays_2_sentinel_arrays(samples)

            # standardize to always work along last axis
            reduced_axes = axis
            if axis is None:
                if samples:
                    # when axis=None, take the maximum of all dimensions since
                    # all the dimensions are reduced.
                    n_dims = np.max([sample.ndim for sample in samples])
                    reduced_axes = tuple(range(n_dims))
                samples = [np.asarray(sample.ravel()) for sample in samples]
            else:
                samples = _broadcast_arrays(samples, axis=axis)
                axis = np.atleast_1d(axis)
                n_axes = len(axis)
                # move all axes in `axis` to the end to be raveled
                samples = [np.moveaxis(sample, axis, range(-len(axis), 0))
                           for sample in samples]
                shapes = [sample.shape for sample in samples]
                # New shape is unchanged for all axes _not_ in `axis`
                # At the end, we append the product of the shapes of the axes
                # in `axis`. Appending -1 doesn't work for zero-size arrays!
                new_shapes = [shape[:-n_axes] + (np.prod(shape[-n_axes:]),)
                              for shape in shapes]
                samples = [sample.reshape(new_shape)
                           for sample, new_shape in zip(samples, new_shapes)]
                axis = -1  # work over the last axis
            NaN = _get_nan(*samples) if samples else np.nan

            # if axis is not needed, just handle nan_policy and return
            ndims = np.array([sample.ndim for sample in samples])
            if np.all(ndims <= 1):
                # Addresses nan_policy == "raise"
                if nan_policy != 'propagate' or override['nan_propagation']:
                    contains_nan = [_contains_nan(sample, nan_policy)[0]
                                    for sample in samples]
                else:
                    # Behave as though there are no NaNs (even if there are)
                    contains_nan = [False]*len(samples)

                # Addresses nan_policy == "propagate"
                if any(contains_nan) and (nan_policy == 'propagate'
                                          and override['nan_propagation']):
                    res = np.full(n_out, NaN)
                    res = _add_reduced_axes(res, reduced_axes, keepdims)
                    return tuple_to_result(*res)

                # Addresses nan_policy == "omit"
                too_small_msg = too_small_1d_not_omit
                if any(contains_nan) and nan_policy == 'omit':
                    # consider passing in contains_nan
                    samples = _remove_nans(samples, paired)
                    too_small_msg = too_small_1d_omit

                if sentinel:
                    samples = _remove_sentinel(samples, paired, sentinel)

                if is_too_small(samples, kwds):
                    warnings.warn(too_small_msg, SmallSampleWarning, stacklevel=2)
                    res = np.full(n_out, NaN)
                    res = _add_reduced_axes(res, reduced_axes, keepdims)
                    return tuple_to_result(*res)

                res = hypotest_fun_out(*samples, **kwds)
                res = result_to_tuple(res)
                res = _add_reduced_axes(res, reduced_axes, keepdims)
                return tuple_to_result(*res)

            # check for empty input
            empty_output = _check_empty_inputs(samples, axis)
            # only return empty output if zero sized input is too small.
            if (
                empty_output is not None
                and (is_too_small(samples, kwds) or empty_output.size == 0)
            ):
                if is_too_small(samples, kwds) and empty_output.size != 0:
                    warnings.warn(too_small_nd_not_omit, SmallSampleWarning,
                                  stacklevel=2)
                res = [empty_output.copy() for i in range(n_out)]
                res = _add_reduced_axes(res, reduced_axes, keepdims)
                return tuple_to_result(*res)

            # otherwise, concatenate all samples along axis, remembering where
            # each separate sample begins
            lengths = np.array([sample.shape[axis] for sample in samples])
            split_indices = np.cumsum(lengths)
            x = _broadcast_concatenate(samples, axis)

            # Addresses nan_policy == "raise"
            if nan_policy != 'propagate' or override['nan_propagation']:
                contains_nan, _ = _contains_nan(x, nan_policy)
            else:
                contains_nan = False  # behave like there are no NaNs

            if vectorized and not contains_nan and not sentinel:
                res = hypotest_fun_out(*samples, axis=axis, **kwds)
                res = result_to_tuple(res)
                res = _add_reduced_axes(res, reduced_axes, keepdims)
                return tuple_to_result(*res)

            # Addresses nan_policy == "omit"
            if contains_nan and nan_policy == 'omit':
                def hypotest_fun(x):
                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
                    samples = _remove_nans(samples, paired)
                    if sentinel:
                        samples = _remove_sentinel(samples, paired, sentinel)
                    if is_too_small(samples, kwds):
                        warnings.warn(too_small_nd_omit, SmallSampleWarning,
                                      stacklevel=4)
                        return np.full(n_out, NaN)
                    return result_to_tuple(hypotest_fun_out(*samples, **kwds))

            # Addresses nan_policy == "propagate"
            elif (contains_nan and nan_policy == 'propagate'
                  and override['nan_propagation']):
                def hypotest_fun(x):
                    if np.isnan(x).any():
                        return np.full(n_out, NaN)

                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
                    if sentinel:
                        samples = _remove_sentinel(samples, paired, sentinel)
                    if is_too_small(samples, kwds):
                        return np.full(n_out, NaN)
                    return result_to_tuple(hypotest_fun_out(*samples, **kwds))

            else:
                def hypotest_fun(x):
                    samples = np.split(x, split_indices)[:n_samp+n_kwd_samp]
                    if sentinel:
                        samples = _remove_sentinel(samples, paired, sentinel)
                    if is_too_small(samples, kwds):
                        return np.full(n_out, NaN)
                    return result_to_tuple(hypotest_fun_out(*samples, **kwds))

            x = np.moveaxis(x, axis, 0)
            res = np.apply_along_axis(hypotest_fun, axis=0, arr=x)
            res = _add_reduced_axes(res, reduced_axes, keepdims)
            return tuple_to_result(*res)

        # Patch the wrapper's docstring: insert/replace the standard `axis`,
        # `nan_policy`, and `keepdims` parameter entries and append the
        # standard note about np.matrix / masked-array conversion.
        _axis_parameter_doc, _axis_parameter = _get_axis_params(default_axis)
        doc = FunctionDoc(axis_nan_policy_wrapper)
        parameter_names = [param.name for param in doc['Parameters']]
        if 'axis' in parameter_names:
            doc['Parameters'][parameter_names.index('axis')] = (
                _axis_parameter_doc)
        else:
            doc['Parameters'].append(_axis_parameter_doc)
        if 'nan_policy' in parameter_names:
            doc['Parameters'][parameter_names.index('nan_policy')] = (
                _nan_policy_parameter_doc)
        else:
            doc['Parameters'].append(_nan_policy_parameter_doc)
        if 'keepdims' in parameter_names:
            doc['Parameters'][parameter_names.index('keepdims')] = (
                _keepdims_parameter_doc)
        else:
            doc['Parameters'].append(_keepdims_parameter_doc)
        doc['Notes'] += _standard_note_addition
        doc = str(doc).split("\n", 1)[1]  # remove signature
        axis_nan_policy_wrapper.__doc__ = str(doc)

        # Patch the wrapper's signature so the added keyword-only parameters
        # appear in introspection tools.
        sig = inspect.signature(axis_nan_policy_wrapper)
        parameters = sig.parameters
        parameter_list = list(parameters.values())
        if 'axis' not in parameters:
            parameter_list.append(_axis_parameter)
        if 'nan_policy' not in parameters:
            parameter_list.append(_nan_policy_parameter)
        if 'keepdims' not in parameters:
            parameter_list.append(_keepdims_parameter)
        sig = sig.replace(parameters=parameter_list)
        axis_nan_policy_wrapper.__signature__ = sig

        return axis_nan_policy_wrapper
    return axis_nan_policy_decorator
|
||||
Binary file not shown.
27
venv/lib/python3.12/site-packages/scipy/stats/_biasedurn.pxd
Normal file
27
venv/lib/python3.12/site-packages/scipy/stats/_biasedurn.pxd
Normal file
@ -0,0 +1,27 @@
|
||||
# Cython declarations for the vendored "BiasedUrn" C++ library
# (biasedurn/stocc.h).  Declarations only; no behavior is defined here.

# Declare the class with cdef
cdef extern from "biasedurn/stocc.h" nogil:
    # Fisher's noncentral hypergeometric distribution.
    # NOTE(review): constructor arguments are presumably
    # (n, m, N, odds, accuracy) -- confirm against stocc.h.
    cdef cppclass CFishersNCHypergeometric:
        CFishersNCHypergeometric(int, int, int, double, double) except +
        int mode()
        double mean()
        double variance()
        double probability(int x)
        double moments(double * mean, double * var)

    # Wallenius' noncentral hypergeometric distribution; unlike the Fisher
    # variant above, a default constructor is also declared.
    cdef cppclass CWalleniusNCHypergeometric:
        CWalleniusNCHypergeometric() except +
        CWalleniusNCHypergeometric(int, int, int, double, double) except +
        int mode()
        double mean()
        double variance()
        double probability(int x)
        double moments(double * mean, double * var)

    # Random-variate generator.  The `next_double`/`next_normal` function
    # pointers let the caller supply the underlying uniform/normal RNG.
    cdef cppclass StochasticLib3:
        StochasticLib3(int seed) except +
        double Random() except +
        void SetAccuracy(double accur)
        int FishersNCHyp (int n, int m, int N, double odds) except +
        int WalleniusNCHyp (int n, int m, int N, double odds) except +
        double(*next_double)()
        double(*next_normal)(const double m, const double s)
||||
@ -0,0 +1,795 @@
|
||||
import builtins
|
||||
from warnings import catch_warnings, simplefilter
|
||||
import numpy as np
|
||||
from operator import index
|
||||
from collections import namedtuple
|
||||
|
||||
# Public API of this module; `_bincount` is a private helper.
__all__ = ['binned_statistic',
           'binned_statistic_2d',
           'binned_statistic_dd']


# Result of `binned_statistic`: the per-bin statistic, the bin edges, and
# the bin index assigned to each input point.
BinnedStatisticResult = namedtuple('BinnedStatisticResult',
                                   ('statistic', 'bin_edges', 'binnumber'))
||||
|
||||
|
||||
def binned_statistic(x, values, statistic='mean',
                     bins=10, range=None):
    """Compute a binned statistic for one or more sets of data.

    This is a generalization of a histogram function: instead of (or in
    addition to) counting the points that fall in each bin, compute the
    sum, mean, median, or another statistic of the values within each bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  Must have the
        same shape as `x`, or be a set of sequences each with the same
        shape as `x`; in the latter case the statistic is computed on
        each sequence independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean').  One of:

        * 'mean', 'median', 'min', 'max' : the corresponding statistic of
          the values in each bin; empty bins are represented by NaN.
        * 'std' : standard deviation within each bin (ddof=0).
        * 'count' : number of points in each bin (an unweighted
          histogram; `values` is not referenced).
        * 'sum' : sum of values in each bin (a weighted histogram).
        * callable : a function taking a 1D array of values and returning
          a single numerical statistic; empty bins are represented by
          ``function([])``, or NaN if that raises an error.

    bins : int or sequence of scalars, optional
        If an int, the number of equal-width bins in the given range
        (10 by default).  If a sequence, the bin edges (including the
        rightmost edge), allowing non-uniform widths; the number of bins
        is then ``len(bins) - 1``.  Values in `x` below the lowest edge
        are assigned bin number 0; values beyond the highest edge are
        assigned to ``bins[-1]``.
    range : (float, float) or [(float, float)], optional
        The lower and upper range of the bins; defaults to
        ``(x.min(), x.max())``.  Values outside the range are ignored.

    Returns
    -------
    statistic : array
        The values of the selected statistic in each bin.
    bin_edges : array of dtype float
        The bin edges (``length(statistic) + 1``).
    binnumber : 1-D ndarray of ints
        Index of the bin (corresponding to `bin_edges`) in which each
        value of `x` falls; same length as `values`.  A binnumber of `i`
        means the corresponding value is between ``(bin_edges[i-1],
        bin_edges[i])``.

    See Also
    --------
    numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd

    Notes
    -----
    All but the last (righthand-most) bin are half-open: if `bins` is
    ``[1, 2, 3, 4]`` the first bin is ``[1, 2)`` and the last is
    ``[3, 4]``, which *includes* 4.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> from scipy import stats
    >>> stats.binned_statistic([1, 1, 2, 5, 7], [1.0, 1.0, 2.0, 1.5, 3.0],
    ...                        'sum', bins=2)
    BinnedStatisticResult(statistic=array([4. , 4.5]),
            bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
    """
    # `bins` is either a bin count or a sequence of edges; a scalar has
    # no len(), which is how np.histogram distinguishes the two as well.
    try:
        n_bin_args = len(bins)
    except TypeError:
        n_bin_args = 1

    if n_bin_args != 1:
        # a sequence of edges: wrap as the per-dimension list the
        # D-dimensional backend expects
        bins = [np.asarray(bins, float)]

    # the backend also expects one (min, max) pair per dimension
    if range is not None and len(range) == 2:
        range = [range]

    stat_values, all_edges, binnumbers = binned_statistic_dd(
        [x], values, statistic, bins, range)

    return BinnedStatisticResult(stat_values, all_edges[0], binnumbers)
||||
|
||||
|
||||
# Result of `binned_statistic_2d`: per-bin statistic, the edge arrays for
# each of the two dimensions, and the bin number of each input point.
BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
                                     ('statistic', 'x_edge', 'y_edge',
                                      'binnumber'))
||||
|
||||
|
||||
def binned_statistic_2d(x, y, values, statistic='mean',
                        bins=10, range=None, expand_binnumbers=False):
    """Compute a bidimensional binned statistic for one or more sets of data.

    A generalization of a histogram2d function: instead of (or in
    addition to) counting the points that fall in each bin, compute the
    sum, mean, median, or another statistic of the values within each
    two-dimensional bin.

    Parameters
    ----------
    x : (N,) array_like
        A sequence of values to be binned along the first dimension.
    y : (N,) array_like
        A sequence of values to be binned along the second dimension.
    values : (N,) array_like or list of (N,) array_like
        The data on which the statistic will be computed.  Must have the
        same shape as `x`, or be a list of sequences each with the same
        shape as `x`; in the latter case the statistic is computed on
        each sequence independently.
    statistic : string or callable, optional
        The statistic to compute (default is 'mean'): one of 'mean',
        'std' (ddof=0), 'median', 'count', 'sum', 'min', 'max', or a
        function taking a 1D array of values and returning a single
        numerical statistic.  Empty bins are represented by NaN (or by
        ``function([])`` for a callable, NaN if that raises).
    bins : int or [int, int] or array_like or [array, array], optional
        The bin specification:

        * the number of bins for the two dimensions (nx = ny = bins),
        * the number of bins in each dimension (nx, ny = bins),
        * the bin edges for the two dimensions (x_edge = y_edge = bins),
        * the bin edges in each dimension (x_edge, y_edge = bins).

        If edges are specified, the number of bins is
        ``(nx = len(x_edge)-1, ny = len(y_edge)-1)``.
    range : (2,2) array_like, optional
        The leftmost and rightmost edges of the bins along each dimension
        (if not specified explicitly in `bins`):
        ``[[xmin, xmax], [ymin, ymax]]``.  Values outside this range are
        considered outliers and not tallied.
    expand_binnumbers : bool, optional
        'False' (default): the returned `binnumber` is a shape (N,) array
        of linearized bin indices.  'True': `binnumber` is 'unraveled'
        into a shape (2,N) ndarray where each row gives the bin numbers
        in the corresponding dimension.

        .. versionadded:: 0.17.0

    Returns
    -------
    statistic : (nx, ny) ndarray
        The values of the selected statistic in each two-dimensional bin.
    x_edge : (nx + 1) ndarray
        The bin edges along the first dimension.
    y_edge : (ny + 1) ndarray
        The bin edges along the second dimension.
    binnumber : (N,) array of ints or (2,N) ndarray of ints
        The bin in which each element of the sample falls; the
        representation depends on `expand_binnumbers` (see Notes).

    See Also
    --------
    numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd

    Notes
    -----
    All but the last (righthand-most) bin in each dimension are
    half-open; the last bin includes its right edge.  Linearized bin
    indices (``expand_binnumbers=False``) refer to an array with extra
    bins on the outer edges that capture values outside the defined
    bounds; with ``expand_binnumbers=True``, a binnumber of `i` in a
    dimension means the value is between ``(D_edge[i-1], D_edge[i])``.

    .. versionadded:: 0.11.0

    Examples
    --------
    >>> from scipy import stats
    >>> ret = stats.binned_statistic_2d([0.1, 0.1, 0.1, 0.6],
    ...                                 [2.1, 2.6, 2.1, 2.1], None, 'count',
    ...                                 bins=[[0.0, 0.5, 1.0], [2.0, 2.5, 3.0]])
    >>> ret.statistic
    array([[2., 1.],
           [1., 0.]])
    """
    # This code is based on np.histogram2d

    # Distinguish a bin spec shared by both axes from a per-dimension
    # spec: a scalar bin count has no len().
    try:
        n_bin_args = len(bins)
    except TypeError:
        n_bin_args = 1

    if n_bin_args not in (1, 2):
        # a flat sequence of edges applies to both dimensions
        xedges = yedges = np.asarray(bins, float)
        bins = [xedges, yedges]

    stat_values, all_edges, binnumbers = binned_statistic_dd(
        [x, y], values, statistic, bins, range,
        expand_binnumbers=expand_binnumbers)

    return BinnedStatistic2dResult(stat_values, all_edges[0], all_edges[1],
                                   binnumbers)
|
||||
|
||||
|
||||
# Result of `binned_statistic_dd`: per-bin statistic, the list of edge
# arrays (one per dimension), and the bin number of each input point.
BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
                                     ('statistic', 'bin_edges',
                                      'binnumber'))
||||
|
||||
|
||||
def _bincount(x, weights):
|
||||
if np.iscomplexobj(weights):
|
||||
a = np.bincount(x, np.real(weights))
|
||||
b = np.bincount(x, np.imag(weights))
|
||||
z = a + b*1j
|
||||
|
||||
else:
|
||||
z = np.bincount(x, weights)
|
||||
return z
|
||||
|
||||
|
||||
def binned_statistic_dd(sample, values, statistic='mean',
|
||||
bins=10, range=None, expand_binnumbers=False,
|
||||
binned_statistic_result=None):
|
||||
"""
|
||||
Compute a multidimensional binned statistic for a set of data.
|
||||
|
||||
This is a generalization of a histogramdd function. A histogram divides
|
||||
the space into bins, and returns the count of the number of points in
|
||||
each bin. This function allows the computation of the sum, mean, median,
|
||||
or other statistic of the values within each bin.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sample : array_like
|
||||
Data to histogram passed as a sequence of N arrays of length D, or
|
||||
as an (N,D) array.
|
||||
values : (N,) array_like or list of (N,) array_like
|
||||
The data on which the statistic will be computed. This must be
|
||||
the same shape as `sample`, or a list of sequences - each with the
|
||||
same shape as `sample`. If `values` is such a list, the statistic
|
||||
will be computed on each independently.
|
||||
statistic : string or callable, optional
|
||||
The statistic to compute (default is 'mean').
|
||||
The following statistics are available:
|
||||
|
||||
* 'mean' : compute the mean of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'median' : compute the median of values for points within each
|
||||
bin. Empty bins will be represented by NaN.
|
||||
* 'count' : compute the count of points within each bin. This is
|
||||
identical to an unweighted histogram. `values` array is not
|
||||
referenced.
|
||||
* 'sum' : compute the sum of values for points within each bin.
|
||||
This is identical to a weighted histogram.
|
||||
* 'std' : compute the standard deviation within each bin. This
|
||||
is implicitly calculated with ddof=0. If the number of values
|
||||
within a given bin is 0 or 1, the computed standard deviation value
|
||||
will be 0 for the bin.
|
||||
* 'min' : compute the minimum of values for points within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* 'max' : compute the maximum of values for point within each bin.
|
||||
Empty bins will be represented by NaN.
|
||||
* function : a user-defined function which takes a 1D array of
|
||||
values, and outputs a single numerical statistic. This function
|
||||
will be called on the values in each bin. Empty bins will be
|
||||
represented by function([]), or NaN if this returns an error.
|
||||
|
||||
bins : sequence or positive int, optional
|
||||
The bin specification must be in one of the following forms:
|
||||
|
||||
* A sequence of arrays describing the bin edges along each dimension.
|
||||
* The number of bins for each dimension (nx, ny, ... = bins).
|
||||
* The number of bins for all dimensions (nx = ny = ... = bins).
|
||||
range : sequence, optional
|
||||
A sequence of lower and upper bin edges to be used if the edges are
|
||||
not given explicitly in `bins`. Defaults to the minimum and maximum
|
||||
values along each dimension.
|
||||
expand_binnumbers : bool, optional
|
||||
'False' (default): the returned `binnumber` is a shape (N,) array of
|
||||
linearized bin indices.
|
||||
'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
|
||||
ndarray, where each row gives the bin numbers in the corresponding
|
||||
dimension.
|
||||
See the `binnumber` returned value, and the `Examples` section of
|
||||
`binned_statistic_2d`.
|
||||
binned_statistic_result : binnedStatisticddResult
|
||||
Result of a previous call to the function in order to reuse bin edges
|
||||
and bin numbers with new values and/or a different statistic.
|
||||
To reuse bin numbers, `expand_binnumbers` must have been set to False
|
||||
(the default)
|
||||
|
||||
.. versionadded:: 0.17.0
|
||||
|
||||
Returns
|
||||
-------
|
||||
statistic : ndarray, shape(nx1, nx2, nx3,...)
|
||||
The values of the selected statistic in each two-dimensional bin.
|
||||
bin_edges : list of ndarrays
|
||||
A list of D arrays describing the (nxi + 1) bin edges for each
|
||||
dimension.
|
||||
binnumber : (N,) array of ints or (D,N) ndarray of ints
|
||||
This assigns to each element of `sample` an integer that represents the
|
||||
bin in which this observation falls. The representation depends on the
|
||||
`expand_binnumbers` argument. See `Notes` for details.
|
||||
|
||||
|
||||
See Also
|
||||
--------
|
||||
numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
|
||||
|
||||
Notes
|
||||
-----
|
||||
Binedges:
|
||||
All but the last (righthand-most) bin is half-open in each dimension. In
|
||||
other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
|
||||
``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
|
||||
last bin, however, is ``[3, 4]``, which *includes* 4.
|
||||
|
||||
`binnumber`:
|
||||
This returned argument assigns to each element of `sample` an integer that
|
||||
represents the bin in which it belongs. The representation depends on the
|
||||
`expand_binnumbers` argument. If 'False' (default): The returned
|
||||
`binnumber` is a shape (N,) array of linearized indices mapping each
|
||||
element of `sample` to its corresponding bin (using row-major ordering).
|
||||
If 'True': The returned `binnumber` is a shape (D,N) ndarray where
|
||||
each row indicates bin placements for each dimension respectively. In each
|
||||
dimension, a binnumber of `i` means the corresponding value is between
|
||||
(bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
|
||||
|
||||
.. versionadded:: 0.11.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> from mpl_toolkits.mplot3d import Axes3D
|
||||
|
||||
Take an array of 600 (x, y) coordinates as an example.
|
||||
`binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
|
||||
of dimension `D+1` is required.
|
||||
|
||||
>>> mu = np.array([0., 1.])
|
||||
>>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
|
||||
>>> multinormal = stats.multivariate_normal(mu, sigma)
|
||||
>>> data = multinormal.rvs(size=600, random_state=235412)
|
||||
>>> data.shape
|
||||
(600, 2)
|
||||
|
||||
Create bins and count how many arrays fall in each bin:
|
||||
|
||||
>>> N = 60
|
||||
>>> x = np.linspace(-3, 3, N)
|
||||
>>> y = np.linspace(-3, 4, N)
|
||||
>>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
|
||||
... statistic='count')
|
||||
>>> bincounts = ret.statistic
|
||||
|
||||
Set the volume and the location of bars:
|
||||
|
||||
>>> dx = x[1] - x[0]
|
||||
>>> dy = y[1] - y[0]
|
||||
>>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
|
||||
>>> z = 0
|
||||
|
||||
>>> bincounts = bincounts.ravel()
|
||||
>>> x = x.ravel()
|
||||
>>> y = y.ravel()
|
||||
|
||||
>>> fig = plt.figure()
|
||||
>>> ax = fig.add_subplot(111, projection='3d')
|
||||
>>> with np.errstate(divide='ignore'): # silence random axes3d warning
|
||||
... ax.bar3d(x, y, z, dx, dy, bincounts)
|
||||
|
||||
Reuse bin numbers and bin edges with new values:
|
||||
|
||||
>>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
|
||||
... binned_statistic_result=ret,
|
||||
... statistic='mean')
|
||||
"""
|
||||
known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
|
||||
if not callable(statistic) and statistic not in known_stats:
|
||||
raise ValueError(f'invalid statistic {statistic!r}')
|
||||
|
||||
try:
|
||||
bins = index(bins)
|
||||
except TypeError:
|
||||
# bins is not an integer
|
||||
pass
|
||||
# If bins was an integer-like object, now it is an actual Python int.
|
||||
|
||||
# NOTE: for _bin_edges(), see e.g. gh-11365
|
||||
if isinstance(bins, int) and not np.isfinite(sample).all():
|
||||
raise ValueError(f'{sample!r} contains non-finite values.')
|
||||
|
||||
# `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
|
||||
# `Dlen` is the length of elements along each dimension.
|
||||
# This code is based on np.histogramdd
|
||||
try:
|
||||
# `sample` is an ND-array.
|
||||
Dlen, Ndim = sample.shape
|
||||
except (AttributeError, ValueError):
|
||||
# `sample` is a sequence of 1D arrays.
|
||||
sample = np.atleast_2d(sample).T
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
# Store initial shape of `values` to preserve it in the output
|
||||
values = np.asarray(values)
|
||||
input_shape = list(values.shape)
|
||||
# Make sure that `values` is 2D to iterate over rows
|
||||
values = np.atleast_2d(values)
|
||||
Vdim, Vlen = values.shape
|
||||
|
||||
# Make sure `values` match `sample`
|
||||
if statistic != 'count' and Vlen != Dlen:
|
||||
raise AttributeError('The number of `values` elements must match the '
|
||||
'length of each `sample` dimension.')
|
||||
|
||||
try:
|
||||
M = len(bins)
|
||||
if M != Ndim:
|
||||
raise AttributeError('The dimension of bins must be equal '
|
||||
'to the dimension of the sample x.')
|
||||
except TypeError:
|
||||
bins = Ndim * [bins]
|
||||
|
||||
if binned_statistic_result is None:
|
||||
nbin, edges, dedges = _bin_edges(sample, bins, range)
|
||||
binnumbers = _bin_numbers(sample, nbin, edges, dedges)
|
||||
else:
|
||||
edges = binned_statistic_result.bin_edges
|
||||
nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
|
||||
# +1 for outlier bins
|
||||
dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
|
||||
binnumbers = binned_statistic_result.binnumber
|
||||
|
||||
# Avoid overflow with double precision. Complex `values` -> `complex128`.
|
||||
result_type = np.result_type(values, np.float64)
|
||||
result = np.empty([Vdim, nbin.prod()], dtype=result_type)
|
||||
|
||||
if statistic in {'mean', np.mean}:
|
||||
result.fill(np.nan)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
result[vv, a] = flatsum[a] / flatcount[a]
|
||||
elif statistic in {'std', np.std}:
|
||||
result.fill(np.nan)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = flatcount.nonzero()
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
|
||||
std = np.sqrt(
|
||||
_bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
|
||||
)
|
||||
result[vv, a] = std
|
||||
result = np.real(result)
|
||||
elif statistic == 'count':
|
||||
result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
|
||||
result.fill(0)
|
||||
flatcount = _bincount(binnumbers, None)
|
||||
a = np.arange(len(flatcount))
|
||||
result[:, a] = flatcount[np.newaxis, :]
|
||||
elif statistic in {'sum', np.sum}:
|
||||
result.fill(0)
|
||||
for vv in builtins.range(Vdim):
|
||||
flatsum = _bincount(binnumbers, values[vv])
|
||||
a = np.arange(len(flatsum))
|
||||
result[vv, a] = flatsum
|
||||
elif statistic in {'median', np.median}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.lexsort((values[vv], binnumbers))
|
||||
_, j, counts = np.unique(binnumbers[i],
|
||||
return_index=True, return_counts=True)
|
||||
mid = j + (counts - 1) / 2
|
||||
mid_a = values[vv, i][np.floor(mid).astype(int)]
|
||||
mid_b = values[vv, i][np.ceil(mid).astype(int)]
|
||||
medians = (mid_a + mid_b) / 2
|
||||
result[vv, binnumbers[i][j]] = medians
|
||||
elif statistic in {'min', np.min}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.argsort(values[vv])[::-1] # Reversed so the min is last
|
||||
result[vv, binnumbers[i]] = values[vv, i]
|
||||
elif statistic in {'max', np.max}:
|
||||
result.fill(np.nan)
|
||||
for vv in builtins.range(Vdim):
|
||||
i = np.argsort(values[vv])
|
||||
result[vv, binnumbers[i]] = values[vv, i]
|
||||
elif callable(statistic):
|
||||
with np.errstate(invalid='ignore'), catch_warnings():
|
||||
simplefilter("ignore", RuntimeWarning)
|
||||
try:
|
||||
null = statistic([])
|
||||
except Exception:
|
||||
null = np.nan
|
||||
if np.iscomplexobj(null):
|
||||
result = result.astype(np.complex128)
|
||||
result.fill(null)
|
||||
try:
|
||||
_calc_binned_statistic(
|
||||
Vdim, binnumbers, result, values, statistic
|
||||
)
|
||||
except ValueError:
|
||||
result = result.astype(np.complex128)
|
||||
_calc_binned_statistic(
|
||||
Vdim, binnumbers, result, values, statistic
|
||||
)
|
||||
|
||||
# Shape into a proper matrix
|
||||
result = result.reshape(np.append(Vdim, nbin))
|
||||
|
||||
# Remove outliers (indices 0 and -1 for each bin-dimension).
|
||||
core = tuple([slice(None)] + Ndim * [slice(1, -1)])
|
||||
result = result[core]
|
||||
|
||||
# Unravel binnumbers into an ndarray, each row the bins for each dimension
|
||||
if expand_binnumbers and Ndim > 1:
|
||||
binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
|
||||
|
||||
if np.any(result.shape[1:] != nbin - 2):
|
||||
raise RuntimeError('Internal Shape Error')
|
||||
|
||||
# Reshape to have output (`result`) match input (`values`) shape
|
||||
result = result.reshape(input_shape[:-1] + list(nbin-2))
|
||||
|
||||
return BinnedStatisticddResult(result, edges, binnumbers)
|
||||
|
||||
|
||||
def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
|
||||
unique_bin_numbers = np.unique(bin_numbers)
|
||||
for vv in builtins.range(Vdim):
|
||||
bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
|
||||
values, vv)
|
||||
for i in unique_bin_numbers:
|
||||
stat = stat_func(np.array(bin_map[i]))
|
||||
if np.iscomplexobj(stat) and not np.iscomplexobj(result):
|
||||
raise ValueError("The statistic function returns complex ")
|
||||
result[vv, i] = stat
|
||||
|
||||
|
||||
def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
|
||||
""" Create hashmap of bin ids to values in bins
|
||||
key: bin number
|
||||
value: list of binned data
|
||||
"""
|
||||
bin_map = dict()
|
||||
for i in unique_bin_numbers:
|
||||
bin_map[i] = []
|
||||
for i in builtins.range(len(bin_numbers)):
|
||||
bin_map[bin_numbers[i]].append(values[vv, i])
|
||||
return bin_map
|
||||
|
||||
|
||||
def _bin_edges(sample, bins=None, range=None):
|
||||
""" Create edge arrays
|
||||
"""
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
nbin = np.empty(Ndim, int) # Number of bins in each dimension
|
||||
edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
|
||||
dedges = Ndim * [None] # Spacing between edges (will be 2D array)
|
||||
|
||||
# Select range for each dimension
|
||||
# Used only if number of bins is given.
|
||||
if range is None:
|
||||
smin = np.atleast_1d(np.array(sample.min(axis=0), float))
|
||||
smax = np.atleast_1d(np.array(sample.max(axis=0), float))
|
||||
else:
|
||||
if len(range) != Ndim:
|
||||
raise ValueError(
|
||||
f"range given for {len(range)} dimensions; {Ndim} required")
|
||||
smin = np.empty(Ndim)
|
||||
smax = np.empty(Ndim)
|
||||
for i in builtins.range(Ndim):
|
||||
if range[i][1] < range[i][0]:
|
||||
raise ValueError(
|
||||
"In {}range, start must be <= stop".format(
|
||||
f"dimension {i + 1} of " if Ndim > 1 else ""))
|
||||
smin[i], smax[i] = range[i]
|
||||
|
||||
# Make sure the bins have a finite width.
|
||||
for i in builtins.range(len(smin)):
|
||||
if smin[i] == smax[i]:
|
||||
smin[i] = smin[i] - .5
|
||||
smax[i] = smax[i] + .5
|
||||
|
||||
# Preserve sample floating point precision in bin edges
|
||||
edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
|
||||
else float)
|
||||
|
||||
# Create edge arrays
|
||||
for i in builtins.range(Ndim):
|
||||
if np.isscalar(bins[i]):
|
||||
nbin[i] = bins[i] + 2 # +2 for outlier bins
|
||||
edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
|
||||
dtype=edges_dtype)
|
||||
else:
|
||||
edges[i] = np.asarray(bins[i], edges_dtype)
|
||||
nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
|
||||
dedges[i] = np.diff(edges[i])
|
||||
|
||||
nbin = np.asarray(nbin)
|
||||
|
||||
return nbin, edges, dedges
|
||||
|
||||
|
||||
def _bin_numbers(sample, nbin, edges, dedges):
|
||||
"""Compute the bin number each sample falls into, in each dimension
|
||||
"""
|
||||
Dlen, Ndim = sample.shape
|
||||
|
||||
sampBin = [
|
||||
np.digitize(sample[:, i], edges[i])
|
||||
for i in range(Ndim)
|
||||
]
|
||||
|
||||
# Using `digitize`, values that fall on an edge are put in the right bin.
|
||||
# For the rightmost bin, we want values equal to the right
|
||||
# edge to be counted in the last bin, and not as an outlier.
|
||||
for i in range(Ndim):
|
||||
# Find the rounding precision
|
||||
dedges_min = dedges[i].min()
|
||||
if dedges_min == 0:
|
||||
raise ValueError('The smallest edge difference is numerically 0.')
|
||||
decimal = int(-np.log10(dedges_min)) + 6
|
||||
# Find which points are on the rightmost edge.
|
||||
on_edge = np.where((sample[:, i] >= edges[i][-1]) &
|
||||
(np.around(sample[:, i], decimal) ==
|
||||
np.around(edges[i][-1], decimal)))[0]
|
||||
# Shift these points one bin to the left.
|
||||
sampBin[i][on_edge] -= 1
|
||||
|
||||
# Compute the sample indices in the flattened statistic matrix.
|
||||
binnumbers = np.ravel_multi_index(sampBin, nbin)
|
||||
|
||||
return binnumbers
|
||||
375
venv/lib/python3.12/site-packages/scipy/stats/_binomtest.py
Normal file
375
venv/lib/python3.12/site-packages/scipy/stats/_binomtest.py
Normal file
@ -0,0 +1,375 @@
|
||||
from math import sqrt
|
||||
import numpy as np
|
||||
from scipy._lib._util import _validate_int
|
||||
from scipy.optimize import brentq
|
||||
from scipy.special import ndtri
|
||||
from ._discrete_distns import binom
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
class BinomTestResult:
|
||||
"""
|
||||
Result of `scipy.stats.binomtest`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
k : int
|
||||
The number of successes (copied from `binomtest` input).
|
||||
n : int
|
||||
The number of trials (copied from `binomtest` input).
|
||||
alternative : str
|
||||
Indicates the alternative hypothesis specified in the input
|
||||
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
|
||||
or ``'less'``.
|
||||
statistic: float
|
||||
The estimate of the proportion of successes.
|
||||
pvalue : float
|
||||
The p-value of the hypothesis test.
|
||||
|
||||
"""
|
||||
def __init__(self, k, n, alternative, statistic, pvalue):
|
||||
self.k = k
|
||||
self.n = n
|
||||
self.alternative = alternative
|
||||
self.statistic = statistic
|
||||
self.pvalue = pvalue
|
||||
|
||||
# add alias for backward compatibility
|
||||
self.proportion_estimate = statistic
|
||||
|
||||
def __repr__(self):
|
||||
s = ("BinomTestResult("
|
||||
f"k={self.k}, "
|
||||
f"n={self.n}, "
|
||||
f"alternative={self.alternative!r}, "
|
||||
f"statistic={self.statistic}, "
|
||||
f"pvalue={self.pvalue})")
|
||||
return s
|
||||
|
||||
def proportion_ci(self, confidence_level=0.95, method='exact'):
|
||||
"""
|
||||
Compute the confidence interval for ``statistic``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidence_level : float, optional
|
||||
Confidence level for the computed confidence interval
|
||||
of the estimated proportion. Default is 0.95.
|
||||
method : {'exact', 'wilson', 'wilsoncc'}, optional
|
||||
Selects the method used to compute the confidence interval
|
||||
for the estimate of the proportion:
|
||||
|
||||
'exact' :
|
||||
Use the Clopper-Pearson exact method [1]_.
|
||||
'wilson' :
|
||||
Wilson's method, without continuity correction ([2]_, [3]_).
|
||||
'wilsoncc' :
|
||||
Wilson's method, with continuity correction ([2]_, [3]_).
|
||||
|
||||
Default is ``'exact'``.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ci : ``ConfidenceInterval`` object
|
||||
The object has attributes ``low`` and ``high`` that hold the
|
||||
lower and upper bounds of the confidence interval.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] C. J. Clopper and E. S. Pearson, The use of confidence or
|
||||
fiducial limits illustrated in the case of the binomial,
|
||||
Biometrika, Vol. 26, No. 4, pp 404-413 (Dec. 1934).
|
||||
.. [2] E. B. Wilson, Probable inference, the law of succession, and
|
||||
statistical inference, J. Amer. Stat. Assoc., 22, pp 209-212
|
||||
(1927).
|
||||
.. [3] Robert G. Newcombe, Two-sided confidence intervals for the
|
||||
single proportion: comparison of seven methods, Statistics
|
||||
in Medicine, 17, pp 857-872 (1998).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import binomtest
|
||||
>>> result = binomtest(k=7, n=50, p=0.1)
|
||||
>>> result.statistic
|
||||
0.14
|
||||
>>> result.proportion_ci()
|
||||
ConfidenceInterval(low=0.05819170033997342, high=0.26739600249700846)
|
||||
"""
|
||||
if method not in ('exact', 'wilson', 'wilsoncc'):
|
||||
raise ValueError(f"method ('{method}') must be one of 'exact', "
|
||||
"'wilson' or 'wilsoncc'.")
|
||||
if not (0 <= confidence_level <= 1):
|
||||
raise ValueError(f'confidence_level ({confidence_level}) must be in '
|
||||
'the interval [0, 1].')
|
||||
if method == 'exact':
|
||||
low, high = _binom_exact_conf_int(self.k, self.n,
|
||||
confidence_level,
|
||||
self.alternative)
|
||||
else:
|
||||
# method is 'wilson' or 'wilsoncc'
|
||||
low, high = _binom_wilson_conf_int(self.k, self.n,
|
||||
confidence_level,
|
||||
self.alternative,
|
||||
correction=method == 'wilsoncc')
|
||||
return ConfidenceInterval(low=low, high=high)
|
||||
|
||||
|
||||
def _findp(func):
|
||||
try:
|
||||
p = brentq(func, 0, 1)
|
||||
except RuntimeError:
|
||||
raise RuntimeError('numerical solver failed to converge when '
|
||||
'computing the confidence limits') from None
|
||||
except ValueError as exc:
|
||||
raise ValueError('brentq raised a ValueError; report this to the '
|
||||
'SciPy developers') from exc
|
||||
return p
|
||||
|
||||
|
||||
def _binom_exact_conf_int(k, n, confidence_level, alternative):
|
||||
"""
|
||||
Compute the estimate and confidence interval for the binomial test.
|
||||
|
||||
Returns proportion, prop_low, prop_high
|
||||
"""
|
||||
if alternative == 'two-sided':
|
||||
alpha = (1 - confidence_level) / 2
|
||||
if k == 0:
|
||||
plow = 0.0
|
||||
else:
|
||||
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
|
||||
if k == n:
|
||||
phigh = 1.0
|
||||
else:
|
||||
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
|
||||
elif alternative == 'less':
|
||||
alpha = 1 - confidence_level
|
||||
plow = 0.0
|
||||
if k == n:
|
||||
phigh = 1.0
|
||||
else:
|
||||
phigh = _findp(lambda p: binom.cdf(k, n, p) - alpha)
|
||||
elif alternative == 'greater':
|
||||
alpha = 1 - confidence_level
|
||||
if k == 0:
|
||||
plow = 0.0
|
||||
else:
|
||||
plow = _findp(lambda p: binom.sf(k-1, n, p) - alpha)
|
||||
phigh = 1.0
|
||||
return plow, phigh
|
||||
|
||||
|
||||
def _binom_wilson_conf_int(k, n, confidence_level, alternative, correction):
|
||||
# This function assumes that the arguments have already been validated.
|
||||
# In particular, `alternative` must be one of 'two-sided', 'less' or
|
||||
# 'greater'.
|
||||
p = k / n
|
||||
if alternative == 'two-sided':
|
||||
z = ndtri(0.5 + 0.5*confidence_level)
|
||||
else:
|
||||
z = ndtri(confidence_level)
|
||||
|
||||
# For reference, the formulas implemented here are from
|
||||
# Newcombe (1998) (ref. [3] in the proportion_ci docstring).
|
||||
denom = 2*(n + z**2)
|
||||
center = (2*n*p + z**2)/denom
|
||||
q = 1 - p
|
||||
if correction:
|
||||
if alternative == 'less' or k == 0:
|
||||
lo = 0.0
|
||||
else:
|
||||
dlo = (1 + z*sqrt(z**2 - 2 - 1/n + 4*p*(n*q + 1))) / denom
|
||||
lo = center - dlo
|
||||
if alternative == 'greater' or k == n:
|
||||
hi = 1.0
|
||||
else:
|
||||
dhi = (1 + z*sqrt(z**2 + 2 - 1/n + 4*p*(n*q - 1))) / denom
|
||||
hi = center + dhi
|
||||
else:
|
||||
delta = z/denom * sqrt(4*n*p*q + z**2)
|
||||
if alternative == 'less' or k == 0:
|
||||
lo = 0.0
|
||||
else:
|
||||
lo = center - delta
|
||||
if alternative == 'greater' or k == n:
|
||||
hi = 1.0
|
||||
else:
|
||||
hi = center + delta
|
||||
|
||||
return lo, hi
|
||||
|
||||
|
||||
def binomtest(k, n, p=0.5, alternative='two-sided'):
|
||||
"""
|
||||
Perform a test that the probability of success is p.
|
||||
|
||||
The binomial test [1]_ is a test of the null hypothesis that the
|
||||
probability of success in a Bernoulli experiment is `p`.
|
||||
|
||||
Details of the test can be found in many texts on statistics, such
|
||||
as section 24.5 of [2]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
k : int
|
||||
The number of successes.
|
||||
n : int
|
||||
The number of trials.
|
||||
p : float, optional
|
||||
The hypothesized probability of success, i.e. the expected
|
||||
proportion of successes. The value must be in the interval
|
||||
``0 <= p <= 1``. The default value is ``p = 0.5``.
|
||||
alternative : {'two-sided', 'greater', 'less'}, optional
|
||||
Indicates the alternative hypothesis. The default value is
|
||||
'two-sided'.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : `~scipy.stats._result_classes.BinomTestResult` instance
|
||||
The return value is an object with the following attributes:
|
||||
|
||||
k : int
|
||||
The number of successes (copied from `binomtest` input).
|
||||
n : int
|
||||
The number of trials (copied from `binomtest` input).
|
||||
alternative : str
|
||||
Indicates the alternative hypothesis specified in the input
|
||||
to `binomtest`. It will be one of ``'two-sided'``, ``'greater'``,
|
||||
or ``'less'``.
|
||||
statistic : float
|
||||
The estimate of the proportion of successes.
|
||||
pvalue : float
|
||||
The p-value of the hypothesis test.
|
||||
|
||||
The object has the following methods:
|
||||
|
||||
proportion_ci(confidence_level=0.95, method='exact') :
|
||||
Compute the confidence interval for ``statistic``.
|
||||
|
||||
Notes
|
||||
-----
|
||||
.. versionadded:: 1.7.0
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Binomial test, https://en.wikipedia.org/wiki/Binomial_test
|
||||
.. [2] Jerrold H. Zar, Biostatistical Analysis (fifth edition),
|
||||
Prentice Hall, Upper Saddle River, New Jersey USA (2010)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.stats import binomtest
|
||||
|
||||
A car manufacturer claims that no more than 10% of their cars are unsafe.
|
||||
15 cars are inspected for safety, 3 were found to be unsafe. Test the
|
||||
manufacturer's claim:
|
||||
|
||||
>>> result = binomtest(3, n=15, p=0.1, alternative='greater')
|
||||
>>> result.pvalue
|
||||
0.18406106910639114
|
||||
|
||||
The null hypothesis cannot be rejected at the 5% level of significance
|
||||
because the returned p-value is greater than the critical value of 5%.
|
||||
|
||||
The test statistic is equal to the estimated proportion, which is simply
|
||||
``3/15``:
|
||||
|
||||
>>> result.statistic
|
||||
0.2
|
||||
|
||||
We can use the `proportion_ci()` method of the result to compute the
|
||||
confidence interval of the estimate:
|
||||
|
||||
>>> result.proportion_ci(confidence_level=0.95)
|
||||
ConfidenceInterval(low=0.05684686759024681, high=1.0)
|
||||
|
||||
"""
|
||||
k = _validate_int(k, 'k', minimum=0)
|
||||
n = _validate_int(n, 'n', minimum=1)
|
||||
if k > n:
|
||||
raise ValueError(f'k ({k}) must not be greater than n ({n}).')
|
||||
|
||||
if not (0 <= p <= 1):
|
||||
raise ValueError(f"p ({p}) must be in range [0,1]")
|
||||
|
||||
if alternative not in ('two-sided', 'less', 'greater'):
|
||||
raise ValueError(f"alternative ('{alternative}') not recognized; \n"
|
||||
"must be 'two-sided', 'less' or 'greater'")
|
||||
if alternative == 'less':
|
||||
pval = binom.cdf(k, n, p)
|
||||
elif alternative == 'greater':
|
||||
pval = binom.sf(k-1, n, p)
|
||||
else:
|
||||
# alternative is 'two-sided'
|
||||
d = binom.pmf(k, n, p)
|
||||
rerr = 1 + 1e-7
|
||||
if k == p * n:
|
||||
# special case as shortcut, would also be handled by `else` below
|
||||
pval = 1.
|
||||
elif k < p * n:
|
||||
ix = _binary_search_for_binom_tst(lambda x1: -binom.pmf(x1, n, p),
|
||||
-d*rerr, np.ceil(p * n), n)
|
||||
# y is the number of terms between mode and n that are <= d*rerr.
|
||||
# ix gave us the first term where a(ix) <= d*rerr < a(ix-1)
|
||||
# if the first equality doesn't hold, y=n-ix. Otherwise, we
|
||||
# need to include ix as well as the equality holds. Note that
|
||||
# the equality will hold in very very rare situations due to rerr.
|
||||
y = n - ix + int(d*rerr == binom.pmf(ix, n, p))
|
||||
pval = binom.cdf(k, n, p) + binom.sf(n - y, n, p)
|
||||
else:
|
||||
ix = _binary_search_for_binom_tst(lambda x1: binom.pmf(x1, n, p),
|
||||
d*rerr, 0, np.floor(p * n))
|
||||
# y is the number of terms between 0 and mode that are <= d*rerr.
|
||||
# we need to add a 1 to account for the 0 index.
|
||||
# For comparing this with old behavior, see
|
||||
# tst_binary_srch_for_binom_tst method in test_morestats.
|
||||
y = ix + 1
|
||||
pval = binom.cdf(y-1, n, p) + binom.sf(k-1, n, p)
|
||||
|
||||
pval = min(1.0, pval)
|
||||
|
||||
result = BinomTestResult(k=k, n=n, alternative=alternative,
|
||||
statistic=k/n, pvalue=pval)
|
||||
return result
|
||||
|
||||
|
||||
def _binary_search_for_binom_tst(a, d, lo, hi):
|
||||
"""
|
||||
Conducts an implicit binary search on a function specified by `a`.
|
||||
|
||||
Meant to be used on the binomial PMF for the case of two-sided tests
|
||||
to obtain the value on the other side of the mode where the tail
|
||||
probability should be computed. The values on either side of
|
||||
the mode are always in order, meaning binary search is applicable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
a : callable
|
||||
The function over which to perform binary search. Its values
|
||||
for inputs lo and hi should be in ascending order.
|
||||
d : float
|
||||
The value to search.
|
||||
lo : int
|
||||
The lower end of range to search.
|
||||
hi : int
|
||||
The higher end of the range to search.
|
||||
|
||||
Returns
|
||||
-------
|
||||
int
|
||||
The index, i between lo and hi
|
||||
such that a(i)<=d<a(i+1)
|
||||
"""
|
||||
while lo < hi:
|
||||
mid = lo + (hi-lo)//2
|
||||
midval = a(mid)
|
||||
if midval < d:
|
||||
lo = mid+1
|
||||
elif midval > d:
|
||||
hi = mid-1
|
||||
else:
|
||||
return mid
|
||||
if a(lo) <= d:
|
||||
return lo
|
||||
else:
|
||||
return lo-1
|
||||
177
venv/lib/python3.12/site-packages/scipy/stats/_bws_test.py
Normal file
177
venv/lib/python3.12/site-packages/scipy/stats/_bws_test.py
Normal file
@ -0,0 +1,177 @@
|
||||
import numpy as np
|
||||
from functools import partial
|
||||
from scipy import stats
|
||||
|
||||
|
||||
def _bws_input_validation(x, y, alternative, method):
|
||||
''' Input validation and standardization for bws test'''
|
||||
x, y = np.atleast_1d(x, y)
|
||||
if x.ndim > 1 or y.ndim > 1:
|
||||
raise ValueError('`x` and `y` must be exactly one-dimensional.')
|
||||
if np.isnan(x).any() or np.isnan(y).any():
|
||||
raise ValueError('`x` and `y` must not contain NaNs.')
|
||||
if np.size(x) == 0 or np.size(y) == 0:
|
||||
raise ValueError('`x` and `y` must be of nonzero size.')
|
||||
|
||||
z = stats.rankdata(np.concatenate((x, y)))
|
||||
x, y = z[:len(x)], z[len(x):]
|
||||
|
||||
alternatives = {'two-sided', 'less', 'greater'}
|
||||
alternative = alternative.lower()
|
||||
if alternative not in alternatives:
|
||||
raise ValueError(f'`alternative` must be one of {alternatives}.')
|
||||
|
||||
method = stats.PermutationMethod() if method is None else method
|
||||
if not isinstance(method, stats.PermutationMethod):
|
||||
raise ValueError('`method` must be an instance of '
|
||||
'`scipy.stats.PermutationMethod`')
|
||||
|
||||
return x, y, alternative, method
|
||||
|
||||
|
||||
def _bws_statistic(x, y, alternative, axis):
|
||||
'''Compute the BWS test statistic for two independent samples'''
|
||||
# Public function currently does not accept `axis`, but `permutation_test`
|
||||
# uses `axis` to make vectorized call.
|
||||
|
||||
Ri, Hj = np.sort(x, axis=axis), np.sort(y, axis=axis)
|
||||
n, m = Ri.shape[axis], Hj.shape[axis]
|
||||
i, j = np.arange(1, n+1), np.arange(1, m+1)
|
||||
|
||||
Bx_num = Ri - (m + n)/n * i
|
||||
By_num = Hj - (m + n)/m * j
|
||||
|
||||
if alternative == 'two-sided':
|
||||
Bx_num *= Bx_num
|
||||
By_num *= By_num
|
||||
else:
|
||||
Bx_num *= np.abs(Bx_num)
|
||||
By_num *= np.abs(By_num)
|
||||
|
||||
Bx_den = i/(n+1) * (1 - i/(n+1)) * m*(m+n)/n
|
||||
By_den = j/(m+1) * (1 - j/(m+1)) * n*(m+n)/m
|
||||
|
||||
Bx = 1/n * np.sum(Bx_num/Bx_den, axis=axis)
|
||||
By = 1/m * np.sum(By_num/By_den, axis=axis)
|
||||
|
||||
B = (Bx + By) / 2 if alternative == 'two-sided' else (Bx - By) / 2
|
||||
|
||||
return B
|
||||
|
||||
|
||||
def bws_test(x, y, *, alternative="two-sided", method=None):
    r'''Perform the Baumgartner-Weiss-Schindler test on two independent samples.

    The Baumgartner-Weiss-Schindler (BWS) test is a nonparametric test of
    the null hypothesis that the distribution underlying sample `x`
    is the same as the distribution underlying sample `y`. Unlike
    the Kolmogorov-Smirnov, Wilcoxon, and Cramer-Von Mises tests,
    the BWS test weights the integral by the variance of the difference
    in cumulative distribution functions (CDFs), emphasizing the tails of the
    distributions, which increases the power of the test in many applications.

    Parameters
    ----------
    x, y : array-like
        1-d arrays of samples.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        Let *F(u)* and *G(u)* be the cumulative distribution functions of the
        distributions underlying `x` and `y`, respectively. Then the following
        alternative hypotheses are available:

        * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
          at least one *u*.
        * 'less': the distribution underlying `x` is stochastically less than
          the distribution underlying `y`, i.e. *F(u) >= G(u)* for all *u*.
        * 'greater': the distribution underlying `x` is stochastically greater
          than the distribution underlying `y`, i.e. *F(u) <= G(u)* for all
          *u*.

        Under a more restrictive set of assumptions, the alternative hypotheses
        can be expressed in terms of the locations of the distributions;
        see [2]_ section 5.1.
    method : PermutationMethod, optional
        Configures the method used to compute the p-value. The default is
        the default `PermutationMethod` object.

    Returns
    -------
    res : PermutationTestResult
        An object with attributes:

        statistic : float
            The observed test statistic of the data.
        pvalue : float
            The p-value for the given alternative.
        null_distribution : ndarray
            The values of the test statistic generated under the null
            hypothesis.

    See also
    --------
    scipy.stats.wilcoxon, scipy.stats.mannwhitneyu, scipy.stats.ttest_ind

    Notes
    -----
    When ``alternative=='two-sided'``, the statistic is defined by the
    equations given in [1]_ Section 2. This statistic is not appropriate for
    one-sided alternatives; in that case, the statistic is the *negative* of
    that given by the equations in [1]_ Section 2. Consequently, when the
    distribution of the first sample is stochastically greater than that of the
    second sample, the statistic will tend to be positive.

    References
    ----------
    .. [1] Neuhäuser, M. (2005). Exact Tests Based on the
           Baumgartner-Weiss-Schindler Statistic: A Survey. Statistical Papers,
           46(1), 1-29.
    .. [2] Fay, M. P., & Proschan, M. A. (2010). Wilcoxon-Mann-Whitney or
           t-test? On assumptions for hypothesis tests and multiple
           interpretations of decision rules. Statistics surveys, 4, 1.

    Examples
    --------
    We follow the example of table 3 in [1]_: Fourteen children were divided
    randomly into two groups. Their ranks at performing a specific test are
    as follows.

    >>> import numpy as np
    >>> x = [1, 2, 3, 4, 6, 7, 8]
    >>> y = [5, 9, 10, 11, 12, 13, 14]

    We use the BWS test to assess whether there is a statistically significant
    difference between the two groups.
    The null hypothesis is that there is no difference in the distributions of
    performance between the two groups. We decide that a significance level of
    1% is required to reject the null hypothesis in favor of the alternative
    that the distributions are different.
    Since the number of samples is very small, we can compare the observed test
    statistic against the *exact* distribution of the test statistic under the
    null hypothesis.

    >>> from scipy.stats import bws_test
    >>> res = bws_test(x, y)
    >>> print(res.statistic)
    5.132167152575315

    This agrees with :math:`B = 5.132` reported in [1]_. The *p*-value produced
    by `bws_test` also agrees with :math:`p = 0.0029` reported in [1]_.

    >>> print(res.pvalue)
    0.002913752913752914

    Because the p-value is below our threshold of 1%, we take this as evidence
    against the null hypothesis in favor of the alternative that there is a
    difference in performance between the two groups.
    '''
    # After validation, `x` and `y` hold the ranks of the pooled observations
    # and `method` is guaranteed to be a PermutationMethod instance.
    x, y, alternative, method = _bws_input_validation(x, y, alternative,
                                                      method)
    # Bind `alternative` so `permutation_test` only needs to supply the
    # resampled data and `axis`.
    bws_statistic = partial(_bws_statistic, alternative=alternative)

    # For 'two-sided' and 'greater', large values of the statistic are
    # evidence against the null hypothesis, so the permutation test examines
    # the upper tail; only 'less' uses the lower tail (see Notes).
    permutation_alternative = 'less' if alternative == 'less' else 'greater'
    res = stats.permutation_test((x, y), bws_statistic,
                                 alternative=permutation_alternative,
                                 **method._asdict())

    return res
|
||||
459
venv/lib/python3.12/site-packages/scipy/stats/_censored_data.py
Normal file
459
venv/lib/python3.12/site-packages/scipy/stats/_censored_data.py
Normal file
@ -0,0 +1,459 @@
|
||||
import numpy as np
|
||||
|
||||
|
||||
def _validate_1d(a, name, allow_inf=False):
|
||||
if np.ndim(a) != 1:
|
||||
raise ValueError(f'`{name}` must be a one-dimensional sequence.')
|
||||
if np.isnan(a).any():
|
||||
raise ValueError(f'`{name}` must not contain nan.')
|
||||
if not allow_inf and np.isinf(a).any():
|
||||
raise ValueError(f'`{name}` must contain only finite values.')
|
||||
|
||||
|
||||
def _validate_interval(interval):
|
||||
interval = np.asarray(interval)
|
||||
if interval.shape == (0,):
|
||||
# The input was a sequence with length 0.
|
||||
interval = interval.reshape((0, 2))
|
||||
if interval.ndim != 2 or interval.shape[-1] != 2:
|
||||
raise ValueError('`interval` must be a two-dimensional array with '
|
||||
'shape (m, 2), where m is the number of '
|
||||
'interval-censored values, but got shape '
|
||||
f'{interval.shape}')
|
||||
|
||||
if np.isnan(interval).any():
|
||||
raise ValueError('`interval` must not contain nan.')
|
||||
if np.isinf(interval).all(axis=1).any():
|
||||
raise ValueError('In each row in `interval`, both values must not'
|
||||
' be infinite.')
|
||||
if (interval[:, 0] > interval[:, 1]).any():
|
||||
raise ValueError('In each row of `interval`, the left value must not'
|
||||
' exceed the right value.')
|
||||
|
||||
uncensored_mask = interval[:, 0] == interval[:, 1]
|
||||
left_mask = np.isinf(interval[:, 0])
|
||||
right_mask = np.isinf(interval[:, 1])
|
||||
interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
|
||||
|
||||
uncensored2 = interval[uncensored_mask, 0]
|
||||
left2 = interval[left_mask, 1]
|
||||
right2 = interval[right_mask, 0]
|
||||
interval2 = interval[interval_mask]
|
||||
|
||||
return uncensored2, left2, right2, interval2
|
||||
|
||||
|
||||
def _validate_x_censored(x, censored):
|
||||
x = np.asarray(x)
|
||||
if x.ndim != 1:
|
||||
raise ValueError('`x` must be one-dimensional.')
|
||||
censored = np.asarray(censored)
|
||||
if censored.ndim != 1:
|
||||
raise ValueError('`censored` must be one-dimensional.')
|
||||
if (~np.isfinite(x)).any():
|
||||
raise ValueError('`x` must not contain nan or inf.')
|
||||
if censored.size != x.size:
|
||||
raise ValueError('`x` and `censored` must have the same length.')
|
||||
return x, censored.astype(bool)
|
||||
|
||||
|
||||
class CensoredData:
    """
    Instances of this class represent censored data.

    Instances may be passed to the ``fit`` method of continuous
    univariate SciPy distributions for maximum likelihood estimation.
    The *only* method of the univariate continuous distributions that
    understands `CensoredData` is the ``fit`` method. An instance of
    `CensoredData` can not be passed to methods such as ``pdf`` and
    ``cdf``.

    An observation is said to be *censored* when the precise value is unknown,
    but it has a known upper and/or lower bound. The conventional terminology
    is:

    * left-censored: an observation is below a certain value but it is
      unknown by how much.
    * right-censored: an observation is above a certain value but it is
      unknown by how much.
    * interval-censored: an observation lies somewhere on an interval between
      two values.

    Left-, right-, and interval-censored data can be represented by
    `CensoredData`.

    For convenience, the class methods ``left_censored`` and
    ``right_censored`` are provided to create a `CensoredData`
    instance from a single one-dimensional array of measurements
    and a corresponding boolean array to indicate which measurements
    are censored. The class method ``interval_censored`` accepts two
    one-dimensional arrays that hold the lower and upper bounds of the
    intervals.

    Parameters
    ----------
    uncensored : array_like, 1D
        Uncensored observations.
    left : array_like, 1D
        Left-censored observations.
    right : array_like, 1D
        Right-censored observations.
    interval : array_like, 2D, with shape (m, 2)
        Interval-censored observations. Each row ``interval[k, :]``
        represents the interval for the kth interval-censored observation.

    Notes
    -----
    In the input array `interval`, the lower bound of the interval may
    be ``-inf``, and the upper bound may be ``inf``, but at least one must be
    finite. When the lower bound is ``-inf``, the row represents a left-
    censored observation, and when the upper bound is ``inf``, the row
    represents a right-censored observation. If the length of an interval
    is 0 (i.e. ``interval[k, 0] == interval[k, 1]``), the observation is
    treated as uncensored. So one can represent all the types of censored
    and uncensored data in ``interval``, but it is generally more convenient
    to use `uncensored`, `left` and `right` for uncensored, left-censored and
    right-censored observations, respectively.

    Examples
    --------
    In the most general case, a censored data set may contain values that
    are left-censored, right-censored, interval-censored, and uncensored.
    For example, here we create a data set with five observations. Two
    are uncensored (values 1 and 1.5), one is a left-censored observation
    of 0, one is a right-censored observation of 10 and one is
    interval-censored in the interval [2, 3].

    >>> import numpy as np
    >>> from scipy.stats import CensoredData
    >>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
    ...                     interval=[[2, 3]])
    >>> print(data)
    CensoredData(5 values: 2 not censored, 1 left-censored,
    1 right-censored, 1 interval-censored)

    Equivalently,

    >>> data = CensoredData(interval=[[1, 1],
    ...                               [1.5, 1.5],
    ...                               [-np.inf, 0],
    ...                               [10, np.inf],
    ...                               [2, 3]])
    >>> print(data)
    CensoredData(5 values: 2 not censored, 1 left-censored,
    1 right-censored, 1 interval-censored)

    A common case is to have a mix of uncensored observations and censored
    observations that are all right-censored (or all left-censored). For
    example, consider an experiment in which six devices are started at
    various times and left running until they fail. Assume that time is
    measured in hours, and the experiment is stopped after 30 hours, even
    if all the devices have not failed by that time. We might end up with
    data such as this::

        Device  Start-time  Fail-time  Time-to-failure
           1         0          13            13
           2         2          24            22
           3         5          22            17
           4         8          23            15
           5        10         ***           >20
           6        12         ***           >18

    Two of the devices had not failed when the experiment was stopped;
    the observations of the time-to-failure for these two devices are
    right-censored. We can represent this data with

    >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
    >>> print(data)
    CensoredData(6 values: 4 not censored, 2 right-censored)

    Alternatively, we can use the method `CensoredData.right_censored` to
    create a representation of this data. The time-to-failure observations
    are put in the list ``ttf``. The ``censored`` list indicates which values
    in ``ttf`` are censored.

    >>> ttf = [13, 22, 17, 15, 20, 18]
    >>> censored = [False, False, False, False, True, True]

    Pass these lists to `CensoredData.right_censored` to create an
    instance of `CensoredData`.

    >>> data = CensoredData.right_censored(ttf, censored)
    >>> print(data)
    CensoredData(6 values: 4 not censored, 2 right-censored)

    If the input data is interval censored and already stored in two
    arrays, one holding the low end of the intervals and another
    holding the high ends, the class method ``interval_censored`` can
    be used to create the `CensoredData` instance.

    This example creates an instance with four interval-censored values.
    The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].

    >>> a = [10, 0.5, 2, 12.5]  # Low ends of the intervals
    >>> b = [11, 1.0, 3, 13.5]  # High ends of the intervals
    >>> data = CensoredData.interval_censored(low=a, high=b)
    >>> print(data)
    CensoredData(4 values: 0 not censored, 4 interval-censored)

    Finally, we create and censor some data from the `weibull_min`
    distribution, and then fit `weibull_min` to that data. We'll assume
    that the location parameter is known to be 0.

    >>> from scipy.stats import weibull_min
    >>> rng = np.random.default_rng()

    Create the random data set.

    >>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
    >>> x[x > 40] = 40  # Right-censor values greater or equal to 40.

    Create the `CensoredData` instance with the `right_censored` method.
    The censored values are those where the value is 40.

    >>> data = CensoredData.right_censored(x, x == 40)
    >>> print(data)
    CensoredData(250 values: 215 not censored, 35 right-censored)

    35 values have been right-censored.

    Fit `weibull_min` to the censored data. We expect the shape and scale
    to be approximately 2.5 and 30, respectively.

    >>> weibull_min.fit(data, floc=0)
    (2.3575922823897315, 0, 30.40650074451254)

    """

    def __init__(self, uncensored=None, *, left=None, right=None,
                 interval=None):
        # Replace omitted arguments with empty containers of the right shape.
        if uncensored is None:
            uncensored = []
        if left is None:
            left = []
        if right is None:
            right = []
        if interval is None:
            interval = np.empty((0, 2))

        _validate_1d(uncensored, 'uncensored')
        _validate_1d(left, 'left')
        _validate_1d(right, 'right')
        # Degenerate rows of `interval` (equal endpoints) and half-infinite
        # rows are reclassified as uncensored/left/right values.
        uncensored2, left2, right2, interval2 = _validate_interval(interval)

        self._uncensored = np.concatenate((uncensored, uncensored2))
        self._left = np.concatenate((left, left2))
        self._right = np.concatenate((right, right2))
        # Note that by construction, the private attribute _interval
        # will be a 2D array that contains only finite values representing
        # intervals with nonzero but finite length.
        self._interval = interval2

    def __repr__(self):
        # Collapse multi-line numpy reprs onto a single line each.
        uncensored_str = " ".join(np.array_repr(self._uncensored).split())
        left_str = " ".join(np.array_repr(self._left).split())
        right_str = " ".join(np.array_repr(self._right).split())
        interval_str = " ".join(np.array_repr(self._interval).split())
        return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
                f"right={right_str}, interval={interval_str})")

    def __str__(self):
        # Human-readable summary: total count plus per-category counts,
        # omitting censored categories that are empty.
        num_nc = len(self._uncensored)
        num_lc = len(self._left)
        num_rc = len(self._right)
        num_ic = len(self._interval)
        n = num_nc + num_lc + num_rc + num_ic
        parts = [f'{num_nc} not censored']
        if num_lc > 0:
            parts.append(f'{num_lc} left-censored')
        if num_rc > 0:
            parts.append(f'{num_rc} right-censored')
        if num_ic > 0:
            parts.append(f'{num_ic} interval-censored')
        return f'CensoredData({n} values: ' + ', '.join(parts) + ')'

    # This is not a complete implementation of the arithmetic operators.
    # All we need is subtracting a scalar and dividing by a scalar.

    def __sub__(self, other):
        # Shift every stored value (used e.g. to remove a location parameter).
        return CensoredData(uncensored=self._uncensored - other,
                            left=self._left - other,
                            right=self._right - other,
                            interval=self._interval - other)

    def __truediv__(self, other):
        # Scale every stored value (used e.g. to remove a scale parameter).
        return CensoredData(uncensored=self._uncensored / other,
                            left=self._left / other,
                            right=self._right / other,
                            interval=self._interval / other)

    def __len__(self):
        """
        The number of values (censored and not censored).
        """
        return (len(self._uncensored) + len(self._left) + len(self._right)
                + len(self._interval))

    def num_censored(self):
        """
        Number of censored values.
        """
        return len(self._left) + len(self._right) + len(self._interval)

    @classmethod
    def right_censored(cls, x, censored):
        """
        Create a `CensoredData` instance of right-censored data.

        Parameters
        ----------
        x : array_like
            `x` is the array of observed data or measurements.
            `x` must be a one-dimensional sequence of finite numbers.
        censored : array_like of bool
            `censored` must be a one-dimensional sequence of boolean
            values.  If ``censored[k]`` is True, the corresponding value
            in `x` is right-censored.  That is, the value ``x[k]``
            is the lower bound of the true (but unknown) value.

        Returns
        -------
        data : `CensoredData`
            An instance of `CensoredData` that represents the
            collection of uncensored and right-censored values.

        Examples
        --------
        >>> from scipy.stats import CensoredData

        Two uncensored values (4 and 10) and two right-censored values
        (24 and 25).

        >>> data = CensoredData.right_censored([4, 10, 24, 25],
        ...                                    [False, False, True, True])
        >>> data
        CensoredData(uncensored=array([ 4., 10.]),
        left=array([], dtype=float64), right=array([24., 25.]),
        interval=array([], shape=(0, 2), dtype=float64))
        >>> print(data)
        CensoredData(4 values: 2 not censored, 2 right-censored)
        """
        x, censored = _validate_x_censored(x, censored)
        return cls(uncensored=x[~censored], right=x[censored])

    @classmethod
    def left_censored(cls, x, censored):
        """
        Create a `CensoredData` instance of left-censored data.

        Parameters
        ----------
        x : array_like
            `x` is the array of observed data or measurements.
            `x` must be a one-dimensional sequence of finite numbers.
        censored : array_like of bool
            `censored` must be a one-dimensional sequence of boolean
            values.  If ``censored[k]`` is True, the corresponding value
            in `x` is left-censored.  That is, the value ``x[k]``
            is the upper bound of the true (but unknown) value.

        Returns
        -------
        data : `CensoredData`
            An instance of `CensoredData` that represents the
            collection of uncensored and left-censored values.

        Examples
        --------
        >>> from scipy.stats import CensoredData

        Two uncensored values (0.12 and 0.033) and two left-censored values
        (both 1e-3).

        >>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
        ...                                   [False, False, True, True])
        >>> data
        CensoredData(uncensored=array([0.12 , 0.033]),
        left=array([0.001, 0.001]), right=array([], dtype=float64),
        interval=array([], shape=(0, 2), dtype=float64))
        >>> print(data)
        CensoredData(4 values: 2 not censored, 2 left-censored)
        """
        x, censored = _validate_x_censored(x, censored)
        return cls(uncensored=x[~censored], left=x[censored])

    @classmethod
    def interval_censored(cls, low, high):
        """
        Create a `CensoredData` instance of interval-censored data.

        This method is useful when all the data is interval-censored, and
        the low and high ends of the intervals are already stored in
        separate one-dimensional arrays.

        Parameters
        ----------
        low : array_like
            The one-dimensional array containing the low ends of the
            intervals.
        high : array_like
            The one-dimensional array containing the high ends of the
            intervals.

        Returns
        -------
        data : `CensoredData`
            An instance of `CensoredData` that represents the
            collection of censored values.

        Examples
        --------
        >>> import numpy as np
        >>> from scipy.stats import CensoredData

        ``a`` and ``b`` are the low and high ends of a collection of
        interval-censored values.

        >>> a = [0.5, 2.0, 3.0, 5.5]
        >>> b = [1.0, 2.5, 3.5, 7.0]
        >>> data = CensoredData.interval_censored(low=a, high=b)
        >>> print(data)
        CensoredData(4 values: 0 not censored, 4 interval-censored)
        """
        _validate_1d(low, 'low', allow_inf=True)
        _validate_1d(high, 'high', allow_inf=True)
        if len(low) != len(high):
            raise ValueError('`low` and `high` must have the same length.')
        # Stack the endpoints into rows; degenerate and half-infinite rows
        # are reclassified by _validate_interval.
        interval = np.column_stack((low, high))
        uncensored, left, right, interval = _validate_interval(interval)
        return cls(uncensored=uncensored, left=left, right=right,
                   interval=interval)

    def _uncensor(self):
        """
        This function is used when a non-censored version of the data
        is needed to create a rough estimate of the parameters of a
        distribution via the method of moments or some similar method.
        The data is "uncensored" by taking the given endpoints as the
        data for the left- or right-censored data, and the mean for the
        interval-censored data.
        """
        data = np.concatenate((self._uncensored, self._left, self._right,
                               self._interval.mean(axis=1)))
        return data

    def _supported(self, a, b):
        """
        Return a subset of self containing the values that are in
        (or overlap with) the interval (a, b).
        """
        uncensored = self._uncensored
        uncensored = uncensored[(a < uncensored) & (uncensored < b)]
        left = self._left
        left = left[a < left]
        right = self._right
        right = right[right < b]
        # Keep intervals that overlap (a, b), even partially.
        interval = self._interval
        interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
        return CensoredData(uncensored, left=left, right=right,
                            interval=interval)
|
||||
5
venv/lib/python3.12/site-packages/scipy/stats/_common.py
Normal file
5
venv/lib/python3.12/site-packages/scipy/stats/_common.py
Normal file
@ -0,0 +1,5 @@
|
||||
from collections import namedtuple


# Lightweight (low, high) pair returned by confidence-interval methods.
ConfidenceInterval = namedtuple("ConfidenceInterval", ["low", "high"])
ConfidenceInterval.__doc__ = "Class for confidence intervals."
|
||||
39
venv/lib/python3.12/site-packages/scipy/stats/_constants.py
Normal file
39
venv/lib/python3.12/site-packages/scipy/stats/_constants.py
Normal file
@ -0,0 +1,39 @@
|
||||
"""
|
||||
Statistics-related constants.
|
||||
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
|
||||
# The smallest representable positive number such that 1.0 + _EPS != 1.0.
|
||||
_EPS = np.finfo(float).eps
|
||||
|
||||
# The largest [in magnitude] usable floating value.
|
||||
_XMAX = np.finfo(float).max
|
||||
|
||||
# The log of the largest usable floating value; useful for knowing
|
||||
# when exp(something) will overflow
|
||||
_LOGXMAX = np.log(_XMAX)
|
||||
|
||||
# The smallest [in magnitude] usable (i.e. not subnormal) double precision
|
||||
# floating value.
|
||||
_XMIN = np.finfo(float).tiny
|
||||
|
||||
# The log of the smallest [in magnitude] usable (i.e not subnormal)
|
||||
# double precision floating value.
|
||||
_LOGXMIN = np.log(_XMIN)
|
||||
|
||||
# -special.psi(1)
|
||||
_EULER = 0.577215664901532860606512090082402431042
|
||||
|
||||
# special.zeta(3, 1) Apery's constant
|
||||
_ZETA3 = 1.202056903159594285399738161511449990765
|
||||
|
||||
# sqrt(pi)
|
||||
_SQRT_PI = 1.772453850905516027298167483341145182798
|
||||
|
||||
# sqrt(2/pi)
|
||||
_SQRT_2_OVER_PI = 0.7978845608028654
|
||||
|
||||
# log(sqrt(2/pi))
|
||||
_LOG_SQRT_2_OVER_PI = -0.22579135264472744
|
||||
12173
venv/lib/python3.12/site-packages/scipy/stats/_continuous_distns.py
Normal file
12173
venv/lib/python3.12/site-packages/scipy/stats/_continuous_distns.py
Normal file
File diff suppressed because it is too large
Load Diff
633
venv/lib/python3.12/site-packages/scipy/stats/_covariance.py
Normal file
633
venv/lib/python3.12/site-packages/scipy/stats/_covariance.py
Normal file
@ -0,0 +1,633 @@
|
||||
from functools import cached_property

import numpy as np
from scipy import linalg
from scipy.stats import _multivariate


# Only the `Covariance` base class is part of this module's public API.
__all__ = ["Covariance"]
|
||||
|
||||
|
||||
class Covariance:
|
||||
"""
|
||||
Representation of a covariance matrix
|
||||
|
||||
Calculations involving covariance matrices (e.g. data whitening,
|
||||
multivariate normal function evaluation) are often performed more
|
||||
efficiently using a decomposition of the covariance matrix instead of the
|
||||
covariance matrix itself. This class allows the user to construct an
|
||||
object representing a covariance matrix using any of several
|
||||
decompositions and perform calculations using a common interface.
|
||||
|
||||
.. note::
|
||||
|
||||
The `Covariance` class cannot be instantiated directly. Instead, use
|
||||
one of the factory methods (e.g. `Covariance.from_diagonal`).
|
||||
|
||||
Examples
|
||||
--------
|
||||
The `Covariance` class is used by calling one of its
|
||||
factory methods to create a `Covariance` object, then pass that
|
||||
representation of the `Covariance` matrix as a shape parameter of a
|
||||
multivariate distribution.
|
||||
|
||||
For instance, the multivariate normal distribution can accept an array
|
||||
representing a covariance matrix:
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> import numpy as np
|
||||
>>> d = [1, 2, 3]
|
||||
>>> A = np.diag(d) # a diagonal covariance matrix
|
||||
>>> x = [4, -2, 5] # a point of interest
|
||||
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=A)
|
||||
>>> dist.pdf(x)
|
||||
4.9595685102808205e-08
|
||||
|
||||
but the calculations are performed in a very generic way that does not
|
||||
take advantage of any special properties of the covariance matrix. Because
|
||||
our covariance matrix is diagonal, we can use ``Covariance.from_diagonal``
|
||||
to create an object representing the covariance matrix, and
|
||||
`multivariate_normal` can use this to compute the probability density
|
||||
function more efficiently.
|
||||
|
||||
>>> cov = stats.Covariance.from_diagonal(d)
|
||||
>>> dist = stats.multivariate_normal(mean=[0, 0, 0], cov=cov)
|
||||
>>> dist.pdf(x)
|
||||
4.9595685102808205e-08
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
message = ("The `Covariance` class cannot be instantiated directly. "
|
||||
"Please use one of the factory methods "
|
||||
"(e.g. `Covariance.from_diagonal`).")
|
||||
raise NotImplementedError(message)
|
||||
|
||||
@staticmethod
|
||||
def from_diagonal(diagonal):
|
||||
r"""
|
||||
Return a representation of a covariance matrix from its diagonal.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
diagonal : array_like
|
||||
The diagonal elements of a diagonal matrix.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Let the diagonal elements of a diagonal covariance matrix :math:`D` be
|
||||
stored in the vector :math:`d`.
|
||||
|
||||
When all elements of :math:`d` are strictly positive, whitening of a
|
||||
data point :math:`x` is performed by computing
|
||||
:math:`x \cdot d^{-1/2}`, where the inverse square root can be taken
|
||||
element-wise.
|
||||
:math:`\log\det{D}` is calculated as :math:`-2 \sum(\log{d})`,
|
||||
where the :math:`\log` operation is performed element-wise.
|
||||
|
||||
This `Covariance` class supports singular covariance matrices. When
|
||||
computing ``_log_pdet``, non-positive elements of :math:`d` are
|
||||
ignored. Whitening is not well defined when the point to be whitened
|
||||
does not lie in the span of the columns of the covariance matrix. The
|
||||
convention taken here is to treat the inverse square root of
|
||||
non-positive elements of :math:`d` as zeros.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Prepare a symmetric positive definite covariance matrix ``A`` and a
|
||||
data point ``x``.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> n = 5
|
||||
>>> A = np.diag(rng.random(n))
|
||||
>>> x = rng.random(size=n)
|
||||
|
||||
Extract the diagonal from ``A`` and create the `Covariance` object.
|
||||
|
||||
>>> d = np.diag(A)
|
||||
>>> cov = stats.Covariance.from_diagonal(d)
|
||||
|
||||
Compare the functionality of the `Covariance` object against a
|
||||
reference implementations.
|
||||
|
||||
>>> res = cov.whiten(x)
|
||||
>>> ref = np.diag(d**-0.5) @ x
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
>>> res = cov.log_pdet
|
||||
>>> ref = np.linalg.slogdet(A)[-1]
|
||||
>>> np.allclose(res, ref)
|
||||
True
|
||||
|
||||
"""
|
||||
return CovViaDiagonal(diagonal)
|
||||
|
||||
@staticmethod
def from_precision(precision, covariance=None):
    r"""
    Build a covariance representation from a precision matrix.

    Parameters
    ----------
    precision : array_like
        The precision matrix, i.e. the inverse of a square, symmetric,
        positive definite covariance matrix.
    covariance : array_like, optional
        The covariance matrix itself. If omitted, it may be computed
        when explicitly required (e.g. for the cumulative distribution
        function of `scipy.stats.multivariate_normal`) by inverting
        `precision`.

    Notes
    -----
    Let the covariance matrix be :math:`A`, its precision matrix be
    :math:`P = A^{-1}`, and :math:`L` be the lower Cholesky factor with
    :math:`L L^T = P`. A data point :math:`x` is whitened by computing
    :math:`x^T L`, and :math:`\log\det{A}` is obtained as
    :math:`-2tr(\log{L})` with an element-wise :math:`\log`.

    Singular covariance matrices are not supported here: a singular
    covariance matrix has no precision matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> P = rng.random(size=(5, 5))
    >>> P = P @ P.T  # a precision matrix must be positive definite
    >>> x = rng.random(size=5)
    >>> cov = stats.Covariance.from_precision(P)
    >>> np.allclose(cov.whiten(x), x @ np.linalg.cholesky(P))
    True
    >>> np.allclose(cov.log_pdet, -np.linalg.slogdet(P)[-1])
    True

    """
    return CovViaPrecision(precision, covariance)
|
||||
|
||||
@staticmethod
def from_cholesky(cholesky):
    r"""
    Representation of a covariance provided via the (lower) Cholesky factor

    Parameters
    ----------
    cholesky : array_like
        The lower triangular Cholesky factor of the covariance matrix.

    Notes
    -----
    Let the covariance matrix be :math:`A` with lower Cholesky factor
    :math:`L` satisfying :math:`L L^T = A`. Whitening of a data point
    :math:`x` solves :math:`L^{-1} x`, and :math:`\log\det{A}` is
    computed as :math:`2tr(\log{L})` with an element-wise :math:`\log`.

    Singular covariance matrices are not supported here: a singular
    covariance matrix has no Cholesky decomposition.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.linalg import solve_triangular
    >>> rng = np.random.default_rng()
    >>> A = rng.random(size=(5, 5))
    >>> A = A @ A.T  # make the covariance symmetric positive definite
    >>> x = rng.random(size=5)
    >>> L = np.linalg.cholesky(A)
    >>> cov = stats.Covariance.from_cholesky(L)
    >>> np.allclose(cov.whiten(x), solve_triangular(L, x, lower=True))
    True
    >>> np.allclose(cov.log_pdet, np.linalg.slogdet(A)[-1])
    True

    """
    return CovViaCholesky(cholesky)
|
||||
|
||||
@staticmethod
def from_eigendecomposition(eigendecomposition):
    r"""
    Representation of a covariance provided via eigendecomposition

    Parameters
    ----------
    eigendecomposition : sequence
        A sequence (nominally a tuple) of the eigenvalue and eigenvector
        arrays, as produced by `scipy.linalg.eigh` or `numpy.linalg.eigh`.

    Notes
    -----
    Let the covariance matrix be :math:`A`, :math:`V` the matrix of
    eigenvectors, and :math:`W` the diagonal matrix of eigenvalues with
    :math:`V W V^T = A`. When all eigenvalues are strictly positive,
    whitening of a point :math:`x` computes :math:`x^T (V W^{-1/2})`
    (element-wise inverse square root), and :math:`\log\det{A}` is
    :math:`tr(\log{W})` with an element-wise :math:`\log`.

    Singular covariance matrices are supported: non-positive eigenvalues
    are ignored when computing ``_log_pdet``, and their inverse square
    roots are treated as zeros during whitening. Whitening is only well
    defined for points in the span of the columns of the covariance
    matrix.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> A = rng.random(size=(5, 5))
    >>> A = A @ A.T  # make the covariance symmetric positive definite
    >>> x = rng.random(size=5)
    >>> w, v = np.linalg.eigh(A)
    >>> cov = stats.Covariance.from_eigendecomposition((w, v))
    >>> np.allclose(cov.whiten(x), x @ (v @ np.diag(w**-0.5)))
    True
    >>> np.allclose(cov.log_pdet, np.linalg.slogdet(A)[-1])
    True

    """
    return CovViaEigendecomposition(eigendecomposition)
|
||||
|
||||
def whiten(self, x):
    """
    Perform a whitening transformation on data.

    "Whitening" ("white" as in "white noise", in which each frequency has
    equal magnitude) maps a set of random variables to new variables with
    unit-diagonal covariance. Applied to a zero-mean multivariate normal
    sample, the transformed sample's covariance is approximately the
    identity matrix.

    Parameters
    ----------
    x : array_like
        An array of points. The last dimension must correspond with the
        dimensionality of the space, i.e., the number of columns in the
        covariance matrix.

    Returns
    -------
    x_ : array_like
        The transformed array of points.

    References
    ----------
    .. [1] "Whitening Transformation". Wikipedia.
           https://en.wikipedia.org/wiki/Whitening_transformation
    .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
           coloring linear transformation". Transactions of VSB 18.2
           (2018): 31-35. :doi:`10.31490/tces-2018-0013`

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> n = 3
    >>> A = rng.random(size=(n, n))
    >>> cov_array = A @ A.T  # make matrix symmetric positive definite
    >>> cov_object = stats.Covariance.from_precision(np.linalg.inv(cov_array))
    >>> x = rng.multivariate_normal(np.zeros(n), cov_array, size=(10000))
    >>> x_ = cov_object.whiten(x)
    >>> np.cov(x_, rowvar=False)  # doctest: +SKIP
    array([[0.97862122, 0.00893147, 0.02430451],
           [0.00893147, 0.96719062, 0.02201312],
           [0.02430451, 0.02201312, 0.99206881]])

    """
    # Concrete whitening logic lives in the subclass's `_whiten`.
    return self._whiten(np.asarray(x))
|
||||
|
||||
def colorize(self, x):
    """
    Perform a colorizing transformation on data.

    "Colorizing" ("color" as in "colored noise", in which different
    frequencies may have different magnitudes) maps uncorrelated random
    variables to new variables with the desired covariance. Applied to a
    sample drawn from a zero-mean multivariate normal with identity
    covariance, the transformed sample's covariance is approximately the
    covariance matrix used in the coloring transform.

    Parameters
    ----------
    x : array_like
        An array of points. The last dimension must correspond with the
        dimensionality of the space, i.e., the number of columns in the
        covariance matrix.

    Returns
    -------
    x_ : array_like
        The transformed array of points.

    References
    ----------
    .. [1] "Whitening Transformation". Wikipedia.
           https://en.wikipedia.org/wiki/Whitening_transformation
    .. [2] Novak, Lukas, and Miroslav Vorechovsky. "Generalization of
           coloring linear transformation". Transactions of VSB 18.2
           (2018): 31-35. :doi:`10.31490/tces-2018-0013`

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng(1638083107694713882823079058616272161)
    >>> n = 3
    >>> A = rng.random(size=(n, n))
    >>> cov_array = A @ A.T  # make matrix symmetric positive definite
    >>> cov_object = stats.Covariance.from_cholesky(np.linalg.cholesky(cov_array))
    >>> x = rng.multivariate_normal(np.zeros(n), np.eye(n), size=(10000))
    >>> x_ = cov_object.colorize(x)
    >>> np.allclose(np.cov(x_, rowvar=False), cov_array, rtol=3e-2)
    True

    """
    # Concrete coloring logic lives in the subclass's `_colorize`.
    return self._colorize(np.asarray(x))
|
||||
|
||||
@property
def log_pdet(self):
    """
    Log of the pseudo-determinant of the covariance matrix
    """
    # `[()]` unwraps a 0-d array into a scalar and is a harmless
    # identity slice for batched (higher-dimensional) results.
    out = np.array(self._log_pdet, dtype=float)
    return out[()]
|
||||
|
||||
@property
def rank(self):
    """
    Rank of the covariance matrix
    """
    # `[()]` unwraps a 0-d array into a scalar and is a harmless
    # identity slice for batched (higher-dimensional) results.
    out = np.array(self._rank, dtype=int)
    return out[()]
|
||||
|
||||
@property
def covariance(self):
    """
    Explicit representation of the covariance matrix
    """
    # Subclasses either set `_covariance` directly in __init__ or expose
    # it as a cached_property computed on first access.
    return self._covariance
|
||||
|
||||
@property
def shape(self):
    """
    Shape of the covariance array
    """
    # `_shape` is recorded by each subclass's __init__ from its input.
    return self._shape
|
||||
|
||||
def _validate_matrix(self, A, name):
|
||||
A = np.atleast_2d(A)
|
||||
m, n = A.shape[-2:]
|
||||
if m != n or A.ndim != 2 or not (np.issubdtype(A.dtype, np.integer) or
|
||||
np.issubdtype(A.dtype, np.floating)):
|
||||
message = (f"The input `{name}` must be a square, "
|
||||
"two-dimensional array of real numbers.")
|
||||
raise ValueError(message)
|
||||
return A
|
||||
|
||||
def _validate_vector(self, A, name):
|
||||
A = np.atleast_1d(A)
|
||||
if A.ndim != 1 or not (np.issubdtype(A.dtype, np.integer) or
|
||||
np.issubdtype(A.dtype, np.floating)):
|
||||
message = (f"The input `{name}` must be a one-dimensional array "
|
||||
"of real numbers.")
|
||||
raise ValueError(message)
|
||||
return A
|
||||
|
||||
|
||||
class CovViaPrecision(Covariance):
    """Covariance represented by its precision (inverse) matrix."""

    def __init__(self, precision, covariance=None):
        precision = self._validate_matrix(precision, 'precision')
        if covariance is not None:
            covariance = self._validate_matrix(covariance, 'covariance')
            if precision.shape != covariance.shape:
                raise ValueError(
                    "`precision.shape` must equal `covariance.shape`.")

        # Lower Cholesky factor of the precision matrix; whitening is a
        # right-multiplication by this factor.
        self._chol_P = np.linalg.cholesky(precision)
        # log det(A) = -log det(P) = -2 * sum(log diag(chol(P)))
        self._log_pdet = -2*np.log(np.diag(self._chol_P)).sum(axis=-1)
        self._rank = precision.shape[-1]  # must be full rank if invertible
        self._precision = precision
        self._cov_matrix = covariance
        self._shape = precision.shape
        self._allow_singular = False

    def _whiten(self, x):
        return x @ self._chol_P

    @cached_property
    def _covariance(self):
        # Use the provided covariance if available; otherwise invert the
        # precision matrix via its Cholesky factor (computed lazily).
        if self._cov_matrix is not None:
            return self._cov_matrix
        n = self._shape[-1]
        return linalg.cho_solve((self._chol_P, True), np.eye(n))

    def _colorize(self, x):
        # Coloring is the inverse of whitening: solve against chol(P)^T.
        return linalg.solve_triangular(self._chol_P.T, x.T, lower=False).T
|
||||
|
||||
|
||||
def _dot_diag(x, d):
|
||||
# If d were a full diagonal matrix, x @ d would always do what we want.
|
||||
# Special treatment is needed for n-dimensional `d` in which each row
|
||||
# includes only the diagonal elements of a covariance matrix.
|
||||
return x * d if x.ndim < 2 else x * np.expand_dims(d, -2)
|
||||
|
||||
|
||||
class CovViaDiagonal(Covariance):
    """Covariance represented by the diagonal of a diagonal matrix."""

    def __init__(self, diagonal):
        diagonal = self._validate_vector(diagonal, 'diagonal')

        nonpositive = diagonal <= 0
        clipped = np.array(diagonal, dtype=np.float64)
        # Replace non-positive entries with 1: log(1) == 0, so they are
        # effectively ignored in the pseudo-log-determinant.
        clipped[nonpositive] = 1
        self._log_pdet = np.sum(np.log(clipped), axis=-1)

        # Inverse square roots for whitening; zero out null directions.
        pseudo_reciprocals = 1 / np.sqrt(clipped)
        pseudo_reciprocals[nonpositive] = 0

        self._sqrt_diagonal = np.sqrt(diagonal)
        self._LP = pseudo_reciprocals
        self._rank = clipped.shape[-1] - nonpositive.sum(axis=-1)
        self._covariance = np.apply_along_axis(np.diag, -1, diagonal)
        self._i_zero = nonpositive
        self._shape = self._covariance.shape
        self._allow_singular = True

    def _whiten(self, x):
        return _dot_diag(x, self._LP)

    def _colorize(self, x):
        return _dot_diag(x, self._sqrt_diagonal)

    def _support_mask(self, x):
        """
        Check whether x lies in the support of the distribution.
        """
        # Any nonzero component along a null (non-positive) direction
        # puts the point outside the support.
        return ~np.any(_dot_diag(x, self._i_zero), axis=-1)
|
||||
|
||||
|
||||
class CovViaCholesky(Covariance):
    """Covariance represented by its lower Cholesky factor."""

    def __init__(self, cholesky):
        L = self._validate_matrix(cholesky, 'cholesky')

        self._factor = L
        # log det(A) = 2 * sum(log diag(L)) since A = L L^T.
        self._log_pdet = 2*np.log(np.diag(L)).sum(axis=-1)
        self._rank = L.shape[-1]  # must be full rank for cholesky
        self._shape = L.shape
        self._allow_singular = False

    @cached_property
    def _covariance(self):
        # Reconstruct A = L L^T lazily; cached after first access.
        return self._factor @ self._factor.T

    def _whiten(self, x):
        # Whitening solves L z = x for each point (triangular solve).
        return linalg.solve_triangular(self._factor, x.T, lower=True).T

    def _colorize(self, x):
        return x @ self._factor.T
|
||||
|
||||
|
||||
class CovViaEigendecomposition(Covariance):
    """Covariance represented by its eigenvalues and eigenvectors."""

    def __init__(self, eigendecomposition):
        eigenvalues, eigenvectors = eigendecomposition
        eigenvalues = self._validate_vector(eigenvalues, 'eigenvalues')
        eigenvectors = self._validate_matrix(eigenvectors, 'eigenvectors')
        try:
            # Broadcast the (batched) eigenvalue vectors against the
            # (batched) eigenvector matrices, then restore vector shape.
            eigenvalues = np.expand_dims(eigenvalues, -2)
            eigenvectors, eigenvalues = np.broadcast_arrays(eigenvectors,
                                                            eigenvalues)
            eigenvalues = eigenvalues[..., 0, :]
        except ValueError:
            raise ValueError("The shapes of `eigenvalues` and `eigenvectors` "
                             "must be compatible.")

        nonpositive = eigenvalues <= 0
        clipped = np.array(eigenvalues, dtype=np.float64)
        # Replace non-positive eigenvalues with 1: log(1) == 0, so they
        # are effectively ignored in the pseudo-log-determinant.
        clipped[nonpositive] = 1
        self._log_pdet = np.sum(np.log(clipped), axis=-1)

        # Inverse square roots for whitening; zero out null directions.
        pseudo_reciprocals = 1 / np.sqrt(clipped)
        pseudo_reciprocals[nonpositive] = 0

        self._LP = eigenvectors * pseudo_reciprocals
        self._LA = eigenvectors * np.sqrt(eigenvalues)
        self._rank = clipped.shape[-1] - nonpositive.sum(axis=-1)
        self._w = eigenvalues
        self._v = eigenvectors
        self._shape = eigenvectors.shape
        self._null_basis = eigenvectors * nonpositive
        # Tolerance used only by `_support_mask`, not to decide whether
        # the covariance is singular.
        self._eps = _multivariate._eigvalsh_to_eps(eigenvalues) * 10**3
        self._allow_singular = True

    def _whiten(self, x):
        return x @ self._LP

    def _colorize(self, x):
        return x @ self._LA.T

    @cached_property
    def _covariance(self):
        # Reconstruct A = V W V^T lazily; cached after first access.
        return (self._v * self._w) @ self._v.T

    def _support_mask(self, x):
        """
        Check whether x lies in the support of the distribution.
        """
        # A point is in the support when its projection onto the null
        # space of the covariance is numerically zero.
        return np.linalg.norm(x @ self._null_basis, axis=-1) < self._eps
|
||||
|
||||
|
||||
class CovViaPSD(Covariance):
    """
    Representation of a covariance provided via an instance of _PSD
    """

    def __init__(self, psd):
        # Cache the pieces of the _PSD object that the Covariance
        # interface needs; `psd.U` serves as the whitening factor.
        self._LP = psd.U
        self._log_pdet = psd.log_pdet
        self._rank = psd.rank
        self._covariance = psd._M
        self._shape = psd._M.shape
        # Keep the _PSD instance itself so support checks can delegate.
        self._psd = psd
        self._allow_singular = False  # by default

    def _whiten(self, x):
        # Whitening is right-multiplication by the factor from _PSD.
        return x @ self._LP

    def _support_mask(self, x):
        # Delegate to the _PSD object's own support test.
        return self._psd._support_mask(x)
|
||||
204
venv/lib/python3.12/site-packages/scipy/stats/_crosstab.py
Normal file
204
venv/lib/python3.12/site-packages/scipy/stats/_crosstab.py
Normal file
@ -0,0 +1,204 @@
|
||||
import numpy as np
|
||||
from scipy.sparse import coo_matrix
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
|
||||
|
||||
CrosstabResult = _make_tuple_bunch(
    "CrosstabResult", ["elements", "count"]
)


def crosstab(*args, levels=None, sparse=False):
    """
    Return table of counts for each possible unique combination in ``*args``.

    When ``len(args) > 1``, the computed array is often referred to as a
    *contingency table* [1]_. All input sequences must have the same
    length; the ``count`` array has ``len(args)`` dimensions, with axis
    ``k`` indexed by the unique (or requested) values of ``args[k]``.

    Parameters
    ----------
    *args : sequences
        Aligned sequences whose unique element combinations are counted.
        All sequences must have the same length.
    levels : sequence, optional
        Must be the same length as `args`. Each entry is either None
        (count the values given by ``np.unique`` of the corresponding
        argument) or a sequence of the values to be counted for that
        argument; values not listed are ignored.
    sparse : bool, optional
        If True, return ``count`` as a `scipy.sparse.coo_matrix`; only
        two input sequences are then allowed. Default is False.

    Returns
    -------
    res : CrosstabResult
        An object with attributes ``elements`` (tuple of per-axis label
        arrays; ``elements[i]`` holds ``levels[i]`` where that was given)
        and ``count`` (ndarray or coo_matrix of counts).

    See Also
    --------
    numpy.unique

    Notes
    -----
    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] "Contingency table", http://en.wikipedia.org/wiki/Contingency_table

    Examples
    --------
    >>> from scipy.stats.contingency import crosstab
    >>> a = ['A', 'B', 'A', 'A', 'B', 'B', 'A', 'A', 'B', 'B']
    >>> x = ['X', 'X', 'X', 'Y', 'Z', 'Z', 'Y', 'Y', 'Z', 'Z']
    >>> res = crosstab(a, x)
    >>> res.count
    array([[2, 3, 0],
           [1, 0, 4]])

    """
    nargs = len(args)
    if nargs == 0:
        raise TypeError("At least one input sequence is required.")

    len0 = len(args[0])
    if any(len(a) != len0 for a in args[1:]):
        raise ValueError("All input sequences must have the same length.")

    if sparse and nargs != 2:
        raise ValueError("When `sparse` is True, only two input sequences "
                         "are allowed.")

    if levels is None:
        # np.unique with return_inverse gives both the axis labels and
        # each element's index along that axis in one pass.
        actual_levels, indices = zip(*(np.unique(a, return_inverse=True)
                                       for a in args))
    else:
        if len(levels) != nargs:
            raise ValueError('len(levels) must equal the number of input '
                             'sequences')

        args = [np.asarray(arg) for arg in args]
        mask = np.zeros((nargs, len0), dtype=np.bool_)
        inv = np.zeros((nargs, len0), dtype=np.intp)
        actual_levels = []
        for k, (levels_list, arg) in enumerate(zip(levels, args)):
            if levels_list is None:
                levels_list, inv[k, :] = np.unique(arg, return_inverse=True)
                mask[k, :] = True
            else:
                # Row j of q flags positions where arg equals level j.
                q = arg == np.asarray(levels_list).reshape(-1, 1)
                mask[k, :] = np.any(q, axis=0)
                qnz = q.T.nonzero()
                inv[k, qnz[0]] = qnz[1]
            actual_levels.append(levels_list)

        # Keep only positions where every argument matched some level.
        keep = mask.all(axis=0)
        indices = tuple(inv[:, keep])

    if sparse:
        count = coo_matrix((np.ones(len(indices[0]), dtype=int),
                            (indices[0], indices[1])))
        count.sum_duplicates()
    else:
        count = np.zeros([len(u) for u in actual_levels], dtype=int)
        np.add.at(count, indices, 1)

    return CrosstabResult(actual_levels, count)
|
||||
1922
venv/lib/python3.12/site-packages/scipy/stats/_discrete_distns.py
Normal file
1922
venv/lib/python3.12/site-packages/scipy/stats/_discrete_distns.py
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
292
venv/lib/python3.12/site-packages/scipy/stats/_distr_params.py
Normal file
292
venv/lib/python3.12/site-packages/scipy/stats/_distr_params.py
Normal file
@ -0,0 +1,292 @@
|
||||
"""
|
||||
Sane parameters for stats.distributions.
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
# Each entry pairs the name of a continuous distribution in `scipy.stats`
# with a tuple of "sane" shape parameters for generic tests. A name may
# appear more than once to exercise distinct parameter regimes (e.g.
# 'gengamma', 'kappa4', 'pearson3', 'powerlaw', 'truncnorm', 'truncpareto').
# Distributions with no shape parameters carry an empty tuple.
distcont = [
    ['alpha', (3.5704770516650459,)],
    ['anglit', ()],
    ['arcsine', ()],
    ['argus', (1.0,)],
    ['beta', (2.3098496451481823, 0.62687954300963677)],
    ['betaprime', (5, 6)],
    ['bradford', (0.29891359763170633,)],
    ['burr', (10.5, 4.3)],
    ['burr12', (10, 4)],
    ['cauchy', ()],
    ['chi', (78,)],
    ['chi2', (55,)],
    ['cosine', ()],
    ['crystalball', (2.0, 3.0)],
    ['dgamma', (1.1023326088288166,)],
    ['dweibull', (2.0685080649914673,)],
    ['erlang', (10,)],
    ['expon', ()],
    ['exponnorm', (1.5,)],
    ['exponpow', (2.697119160358469,)],
    ['exponweib', (2.8923945291034436, 1.9505288745913174)],
    ['f', (29, 18)],
    ['fatiguelife', (29,)],  # correction numargs = 1
    ['fisk', (3.0857548622253179,)],
    ['foldcauchy', (4.7164673455831894,)],
    ['foldnorm', (1.9521253373555869,)],
    ['gamma', (1.9932305483800778,)],
    ['gausshyper', (13.763771604130699, 3.1189636648681431,
                    2.5145980350183019, 5.1811649903971615)],  # veryslow
    ['genexpon', (9.1325976465418908, 16.231956600590632, 3.2819552690843983)],
    ['genextreme', (-0.1,)],
    ['gengamma', (4.4162385429431925, 3.1193091679242761)],
    ['gengamma', (4.4162385429431925, -3.1193091679242761)],
    ['genhalflogistic', (0.77274727809929322,)],
    ['genhyperbolic', (0.5, 1.5, -0.5,)],
    ['geninvgauss', (2.3, 1.5)],
    ['genlogistic', (0.41192440799679475,)],
    ['gennorm', (1.2988442399460265,)],
    ['halfgennorm', (0.6748054997000371,)],
    ['genpareto', (0.1,)],  # use case with finite moments
    ['gibrat', ()],
    ['gompertz', (0.94743713075105251,)],
    ['gumbel_l', ()],
    ['gumbel_r', ()],
    ['halfcauchy', ()],
    ['halflogistic', ()],
    ['halfnorm', ()],
    ['hypsecant', ()],
    ['invgamma', (4.0668996136993067,)],
    ['invgauss', (0.14546264555347513,)],
    ['invweibull', (10.58,)],
    ['irwinhall', (10,)],
    ['jf_skew_t', (8, 4)],
    ['johnsonsb', (4.3172675099141058, 3.1837781130785063)],
    ['johnsonsu', (2.554395574161155, 2.2482281679651965)],
    ['kappa4', (0.0, 0.0)],
    ['kappa4', (-0.1, 0.1)],
    ['kappa4', (0.0, 0.1)],
    ['kappa4', (0.1, 0.0)],
    ['kappa3', (1.0,)],
    ['ksone', (1000,)],  # replace 22 by 100 to avoid failing range, ticket 956
    ['kstwo', (10,)],
    ['kstwobign', ()],
    ['laplace', ()],
    ['laplace_asymmetric', (2,)],
    ['levy', ()],
    ['levy_l', ()],
    ['levy_stable', (1.8, -0.5)],
    ['loggamma', (0.41411931826052117,)],
    ['logistic', ()],
    ['loglaplace', (3.2505926592051435,)],
    ['lognorm', (0.95368226960575331,)],
    ['loguniform', (0.01, 1.25)],
    ['lomax', (1.8771398388773268,)],
    ['maxwell', ()],
    ['mielke', (10.4, 4.6)],
    ['moyal', ()],
    ['nakagami', (4.9673794866666237,)],
    ['ncf', (27, 27, 0.41578441799226107)],
    ['nct', (14, 0.24045031331198066)],
    ['ncx2', (21, 1.0560465975116415)],
    ['norm', ()],
    ['norminvgauss', (1.25, 0.5)],
    ['pareto', (2.621716532144454,)],
    ['pearson3', (0.1,)],
    ['pearson3', (-2,)],
    ['powerlaw', (1.6591133289905851,)],
    ['powerlaw', (0.6591133289905851,)],
    ['powerlognorm', (2.1413923530064087, 0.44639540782048337)],
    ['powernorm', (4.4453652254590779,)],
    ['rayleigh', ()],
    ['rdist', (1.6,)],
    ['recipinvgauss', (0.63004267809369119,)],
    ['reciprocal', (0.01, 1.25)],
    ['rel_breitwigner', (36.545206797050334, )],
    ['rice', (0.7749725210111873,)],
    ['semicircular', ()],
    ['skewcauchy', (0.5,)],
    ['skewnorm', (4.0,)],
    ['studentized_range', (3.0, 10.0)],
    ['t', (2.7433514990818093,)],
    ['trapezoid', (0.2, 0.8)],
    ['triang', (0.15785029824528218,)],
    ['truncexpon', (4.6907725456810478,)],
    ['truncnorm', (-1.0978730080013919, 2.7306754109031979)],
    ['truncnorm', (0.1, 2.)],
    ['truncpareto', (1.8, 5.3)],
    ['truncpareto', (2, 5)],
    ['truncweibull_min', (2.5, 0.25, 1.75)],
    ['tukeylambda', (3.1321477856738267,)],
    ['uniform', ()],
    ['vonmises', (3.9939042581071398,)],
    ['vonmises_line', (3.9939042581071398,)],
    ['wald', ()],
    ['weibull_max', (2.8687961709100187,)],
    ['weibull_min', (1.7866166930421596,)],
    ['wrapcauchy', (0.031071279018614728,)]]
|
||||
|
||||
|
||||
# Each entry pairs the name of a discrete distribution in `scipy.stats`
# with a tuple of "sane" shape parameters for generic tests. Names may
# repeat to exercise distinct parameter regimes (e.g. 'hypergeom',
# 'nbinom', 'zipfian').
distdiscrete = [
    ['bernoulli',(0.3,)],
    ['betabinom', (5, 2.3, 0.63)],
    ['betanbinom', (5, 9.3, 1)],
    ['binom', (5, 0.4)],
    ['boltzmann',(1.4, 19)],
    ['dlaplace', (0.8,)],  # 0.5
    ['geom', (0.5,)],
    ['hypergeom',(30, 12, 6)],
    ['hypergeom',(21,3,12)],  # numpy.random (3,18,12) numpy ticket:921
    ['hypergeom',(21,18,11)],  # numpy.random (18,3,11) numpy ticket:921
    ['nchypergeom_fisher', (140, 80, 60, 0.5)],
    ['nchypergeom_wallenius', (140, 80, 60, 0.5)],
    ['logser', (0.6,)],  # re-enabled, numpy ticket:921
    ['nbinom', (0.4, 0.4)],  # from tickets: 583
    ['nbinom', (5, 0.5)],
    ['planck', (0.51,)],  # 4.1
    ['poisson', (0.6,)],
    ['randint', (7, 31)],
    ['skellam', (15, 8)],
    ['zipf', (6.6,)],
    ['zipfian', (0.75, 15)],
    ['zipfian', (1.25, 10)],
    ['yulesimon', (11.0,)],
    ['nhypergeom', (20, 7, 1)]
]
|
||||
|
||||
|
||||
# Deliberately *invalid* parameterizations of discrete distributions,
# used to test rejection of bad shape parameters.
invdistdiscrete = [
    # In each of the following, at least one shape parameter is invalid
    ['hypergeom', (3, 3, 4)],
    ['nhypergeom', (5, 2, 8)],
    ['nchypergeom_fisher', (3, 3, 4, 1)],
    ['nchypergeom_wallenius', (3, 3, 4, 1)],
    ['bernoulli', (1.5, )],
    ['binom', (10, 1.5)],
    ['betabinom', (10, -0.4, -0.5)],
    ['betanbinom', (10, -0.4, -0.5)],
    ['boltzmann', (-1, 4)],
    ['dlaplace', (-0.5, )],
    ['geom', (1.5, )],
    ['logser', (1.5, )],
    ['nbinom', (10, 1.5)],
    ['planck', (-0.5, )],
    ['poisson', (-0.5, )],
    ['randint', (5, 2)],
    ['skellam', (-5, -2)],
    ['zipf', (-2, )],
    ['yulesimon', (-2, )],
    ['zipfian', (-0.75, 15)]
]
|
||||
|
||||
|
||||
# Continuous distributions paired with *invalid* shape parameters, used to
# exercise shape-parameter error handling.
invdistcont = [
    # In each of the following, at least one shape parameter is invalid
    ['alpha', (-1, )],
    ['anglit', ()],
    ['arcsine', ()],
    ['argus', (-1, )],
    ['beta', (-2, 2)],
    ['betaprime', (-2, 2)],
    ['bradford', (-1, )],
    ['burr', (-1, 1)],
    ['burr12', (-1, 1)],
    ['cauchy', ()],
    ['chi', (-1, )],
    ['chi2', (-1, )],
    ['cosine', ()],
    ['crystalball', (-1, 2)],
    ['dgamma', (-1, )],
    ['dweibull', (-1, )],
    ['erlang', (-1, )],
    ['expon', ()],
    ['exponnorm', (-1, )],
    ['exponweib', (1, -1)],
    ['exponpow', (-1, )],
    ['f', (10, -10)],
    ['fatiguelife', (-1, )],
    ['fisk', (-1, )],
    ['foldcauchy', (-1, )],
    ['foldnorm', (-1, )],
    ['genlogistic', (-1, )],
    ['gennorm', (-1, )],
    ['genpareto', (np.inf, )],
    ['genexpon', (1, 2, -3)],
    ['genextreme', (np.inf, )],
    ['genhyperbolic', (0.5, -0.5, -1.5,)],
    ['gausshyper', (1, 2, 3, -4)],
    ['gamma', (-1, )],
    ['gengamma', (-1, 0)],
    ['genhalflogistic', (-1, )],
    ['geninvgauss', (1, 0)],
    ['gibrat', ()],
    ['gompertz', (-1, )],
    ['gumbel_r', ()],
    ['gumbel_l', ()],
    ['halfcauchy', ()],
    ['halflogistic', ()],
    ['halfnorm', ()],
    ['halfgennorm', (-1, )],
    ['hypsecant', ()],
    ['invgamma', (-1, )],
    ['invgauss', (-1, )],
    ['invweibull', (-1, )],
    ['irwinhall', (-1,)],
    ['irwinhall', (0,)],
    ['irwinhall', (2.5,)],
    ['jf_skew_t', (-1, 0)],
    ['johnsonsb', (1, -2)],
    ['johnsonsu', (1, -2)],
    ['kappa4', (np.nan, 0)],
    ['kappa3', (-1, )],
    ['ksone', (-1, )],
    ['kstwo', (-1, )],
    ['kstwobign', ()],
    ['laplace', ()],
    ['laplace_asymmetric', (-1, )],
    ['levy', ()],
    ['levy_l', ()],
    ['levy_stable', (-1, 1)],
    ['logistic', ()],
    ['loggamma', (-1, )],
    ['loglaplace', (-1, )],
    ['lognorm', (-1, )],
    ['loguniform', (10, 5)],
    ['lomax', (-1, )],
    ['maxwell', ()],
    ['mielke', (1, -2)],
    ['moyal', ()],
    ['nakagami', (-1, )],
    ['ncx2', (-1, 2)],
    ['ncf', (10, 20, -1)],
    ['nct', (-1, 2)],
    ['norm', ()],
    ['norminvgauss', (5, -10)],
    ['pareto', (-1, )],
    ['pearson3', (np.nan, )],
    ['powerlaw', (-1, )],
    ['powerlognorm', (1, -2)],
    ['powernorm', (-1, )],
    ['rdist', (-1, )],
    ['rayleigh', ()],
    ['rice', (-1, )],
    ['recipinvgauss', (-1, )],
    ['semicircular', ()],
    ['skewnorm', (np.inf, )],
    ['studentized_range', (-1, 1)],
    ['rel_breitwigner', (-2, )],
    ['t', (-1, )],
    ['trapezoid', (0, 2)],
    ['triang', (2, )],
    ['truncexpon', (-1, )],
    ['truncnorm', (10, 5)],
    ['truncpareto', (-1, 5)],
    ['truncpareto', (1.8, .5)],
    ['truncweibull_min', (-2.5, 0.25, 1.75)],
    ['tukeylambda', (np.nan, )],
    ['uniform', ()],
    ['vonmises', (-1, )],
    ['vonmises_line', (-1, )],
    ['wald', ()],
    ['weibull_min', (-1, )],
    ['weibull_max', (-1, )],
    ['wrapcauchy', (2, )],
    ['reciprocal', (15, 10)],
    ['skewcauchy', (2, )]
]
|
||||
426
venv/lib/python3.12/site-packages/scipy/stats/_entropy.py
Normal file
426
venv/lib/python3.12/site-packages/scipy/stats/_entropy.py
Normal file
@ -0,0 +1,426 @@
|
||||
"""
|
||||
Created on Fri Apr 2 09:06:05 2021
|
||||
|
||||
@author: matth
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy import special
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory, _broadcast_arrays
|
||||
from scipy._lib._array_api import array_namespace
|
||||
|
||||
__all__ = ['entropy', 'differential_entropy']
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(
|
||||
lambda x: x,
|
||||
n_samples=lambda kwgs: (
|
||||
2 if ("qk" in kwgs and kwgs["qk"] is not None)
|
||||
else 1
|
||||
),
|
||||
n_outputs=1, result_to_tuple=lambda x: (x,), paired=True,
|
||||
too_small=-1 # entropy doesn't have too small inputs
|
||||
)
|
||||
def entropy(pk: np.typing.ArrayLike,
|
||||
qk: np.typing.ArrayLike | None = None,
|
||||
base: float | None = None,
|
||||
axis: int = 0
|
||||
) -> np.number | np.ndarray:
|
||||
"""
|
||||
Calculate the Shannon entropy/relative entropy of given distribution(s).
|
||||
|
||||
If only probabilities `pk` are given, the Shannon entropy is calculated as
|
||||
``H = -sum(pk * log(pk))``.
|
||||
|
||||
If `qk` is not None, then compute the relative entropy
|
||||
``D = sum(pk * log(pk / qk))``. This quantity is also known
|
||||
as the Kullback-Leibler divergence.
|
||||
|
||||
This routine will normalize `pk` and `qk` if they don't sum to 1.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pk : array_like
|
||||
Defines the (discrete) distribution. Along each axis-slice of ``pk``,
|
||||
element ``i`` is the (possibly unnormalized) probability of event
|
||||
``i``.
|
||||
qk : array_like, optional
|
||||
Sequence against which the relative entropy is computed. Should be in
|
||||
the same format as `pk`.
|
||||
base : float, optional
|
||||
The logarithmic base to use, defaults to ``e`` (natural logarithm).
|
||||
axis : int, optional
|
||||
The axis along which the entropy is calculated. Default is 0.
|
||||
|
||||
Returns
|
||||
-------
|
||||
S : {float, array_like}
|
||||
The calculated entropy.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Informally, the Shannon entropy quantifies the expected uncertainty
|
||||
inherent in the possible outcomes of a discrete random variable.
|
||||
For example,
|
||||
if messages consisting of sequences of symbols from a set are to be
|
||||
encoded and transmitted over a noiseless channel, then the Shannon entropy
|
||||
``H(pk)`` gives a tight lower bound for the average number of units of
|
||||
information needed per symbol if the symbols occur with frequencies
|
||||
governed by the discrete distribution `pk` [1]_. The choice of base
|
||||
determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
|
||||
|
||||
The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
|
||||
number of units of information needed per symbol if the encoding is
|
||||
optimized for the probability distribution `qk` instead of the true
|
||||
distribution `pk`. Informally, the relative entropy quantifies the expected
|
||||
excess in surprise experienced if one believes the true distribution is
|
||||
`qk` when it is actually `pk`.
|
||||
|
||||
A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
|
||||
equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
|
||||
the formula ``CE = -sum(pk * log(qk))``. It gives the average
|
||||
number of units of information needed per symbol if an encoding is
|
||||
optimized for the probability distribution `qk` when the true distribution
|
||||
is `pk`. It is not computed directly by `entropy`, but it can be computed
|
||||
using two calls to the function (see Examples).
|
||||
|
||||
See [2]_ for more information.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
|
||||
Bell System Technical Journal, 27: 379-423.
|
||||
https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
|
||||
.. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
|
||||
Theory (Wiley Series in Telecommunications and Signal Processing).
|
||||
Wiley-Interscience, USA.
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
The outcome of a fair coin is the most uncertain:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import entropy
|
||||
>>> base = 2 # work in units of bits
|
||||
>>> pk = np.array([1/2, 1/2]) # fair coin
|
||||
>>> H = entropy(pk, base=base)
|
||||
>>> H
|
||||
1.0
|
||||
>>> H == -np.sum(pk * np.log(pk)) / np.log(base)
|
||||
True
|
||||
|
||||
The outcome of a biased coin is less uncertain:
|
||||
|
||||
>>> qk = np.array([9/10, 1/10]) # biased coin
|
||||
>>> entropy(qk, base=base)
|
||||
0.46899559358928117
|
||||
|
||||
The relative entropy between the fair coin and biased coin is calculated
|
||||
as:
|
||||
|
||||
>>> D = entropy(pk, qk, base=base)
|
||||
>>> D
|
||||
0.7369655941662062
|
||||
>>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
|
||||
True
|
||||
|
||||
The cross entropy can be calculated as the sum of the entropy and
|
||||
relative entropy`:
|
||||
|
||||
>>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
|
||||
>>> CE
|
||||
1.736965594166206
|
||||
>>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
|
||||
True
|
||||
|
||||
"""
|
||||
if base is not None and base <= 0:
|
||||
raise ValueError("`base` must be a positive number or `None`.")
|
||||
|
||||
xp = array_namespace(pk) if qk is None else array_namespace(pk, qk)
|
||||
|
||||
pk = xp.asarray(pk)
|
||||
with np.errstate(invalid='ignore'):
|
||||
pk = 1.0*pk / xp.sum(pk, axis=axis, keepdims=True) # type: ignore[operator]
|
||||
if qk is None:
|
||||
vec = special.entr(pk)
|
||||
else:
|
||||
qk = xp.asarray(qk)
|
||||
pk, qk = _broadcast_arrays((pk, qk), axis=None, xp=xp) # don't ignore any axes
|
||||
sum_kwargs = dict(axis=axis, keepdims=True)
|
||||
qk = 1.0*qk / xp.sum(qk, **sum_kwargs) # type: ignore[operator, call-overload]
|
||||
vec = special.rel_entr(pk, qk)
|
||||
S = xp.sum(vec, axis=axis)
|
||||
if base is not None:
|
||||
S /= math.log(base)
|
||||
return S
|
||||
|
||||
|
||||
def _differential_entropy_is_too_small(samples, kwargs, axis=-1):
|
||||
values = samples[0]
|
||||
n = values.shape[axis]
|
||||
window_length = kwargs.get("window_length",
|
||||
math.floor(math.sqrt(n) + 0.5))
|
||||
if not 2 <= 2 * window_length < n:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(
|
||||
lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,),
|
||||
too_small=_differential_entropy_is_too_small
|
||||
)
|
||||
def differential_entropy(
|
||||
values: np.typing.ArrayLike,
|
||||
*,
|
||||
window_length: int | None = None,
|
||||
base: float | None = None,
|
||||
axis: int = 0,
|
||||
method: str = "auto",
|
||||
) -> np.number | np.ndarray:
|
||||
r"""Given a sample of a distribution, estimate the differential entropy.
|
||||
|
||||
Several estimation methods are available using the `method` parameter. By
|
||||
default, a method is selected based the size of the sample.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : sequence
|
||||
Sample from a continuous distribution.
|
||||
window_length : int, optional
|
||||
Window length for computing Vasicek estimate. Must be an integer
|
||||
between 1 and half of the sample size. If ``None`` (the default), it
|
||||
uses the heuristic value
|
||||
|
||||
.. math::
|
||||
\left \lfloor \sqrt{n} + 0.5 \right \rfloor
|
||||
|
||||
where :math:`n` is the sample size. This heuristic was originally
|
||||
proposed in [2]_ and has become common in the literature.
|
||||
base : float, optional
|
||||
The logarithmic base to use, defaults to ``e`` (natural logarithm).
|
||||
axis : int, optional
|
||||
The axis along which the differential entropy is calculated.
|
||||
Default is 0.
|
||||
method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
|
||||
The method used to estimate the differential entropy from the sample.
|
||||
Default is ``'auto'``. See Notes for more information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
entropy : float
|
||||
The calculated differential entropy.
|
||||
|
||||
Notes
|
||||
-----
|
||||
This function will converge to the true differential entropy in the limit
|
||||
|
||||
.. math::
|
||||
n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
|
||||
|
||||
The optimal choice of ``window_length`` for a given sample size depends on
|
||||
the (unknown) distribution. Typically, the smoother the density of the
|
||||
distribution, the larger the optimal value of ``window_length`` [1]_.
|
||||
|
||||
The following options are available for the `method` parameter.
|
||||
|
||||
* ``'vasicek'`` uses the estimator presented in [1]_. This is
|
||||
one of the first and most influential estimators of differential entropy.
|
||||
* ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
|
||||
is not only consistent but, under some conditions, asymptotically normal.
|
||||
* ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
|
||||
in simulation to have smaller bias and mean squared error than
|
||||
the Vasicek estimator.
|
||||
* ``'correa'`` uses the estimator presented in [5]_ based on local linear
|
||||
regression. In a simulation study, it had consistently smaller mean
|
||||
square error than the Vasiceck estimator, but it is more expensive to
|
||||
compute.
|
||||
* ``'auto'`` selects the method automatically (default). Currently,
|
||||
this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
|
||||
for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
|
||||
samples, but this behavior is subject to change in future versions.
|
||||
|
||||
All estimators are implemented as described in [6]_.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
|
||||
Journal of the Royal Statistical Society:
|
||||
Series B (Methodological), 38(1), 54-59.
|
||||
.. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
|
||||
goodness-of-fit test for exponentiality. Communications in
|
||||
Statistics-Theory and Methods, 28(5), 1183-1202.
|
||||
.. [3] Van Es, B. (1992). Estimating functionals related to a density by a
|
||||
class of statistics based on spacings. Scandinavian Journal of
|
||||
Statistics, 61-72.
|
||||
.. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
|
||||
of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
|
||||
.. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
|
||||
in Statistics-Theory and Methods, 24(10), 2439-2449.
|
||||
.. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
|
||||
Annals of Data Science, 2(2), 231-241.
|
||||
https://link.springer.com/article/10.1007/s40745-015-0045-9
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import differential_entropy, norm
|
||||
|
||||
Entropy of a standard normal distribution:
|
||||
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> values = rng.standard_normal(100)
|
||||
>>> differential_entropy(values)
|
||||
1.3407817436640392
|
||||
|
||||
Compare with the true entropy:
|
||||
|
||||
>>> float(norm.entropy())
|
||||
1.4189385332046727
|
||||
|
||||
For several sample sizes between 5 and 1000, compare the accuracy of
|
||||
the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
|
||||
compare the root mean squared error (over 1000 trials) between the estimate
|
||||
and the true differential entropy of the distribution.
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>>
|
||||
>>>
|
||||
>>> def rmse(res, expected):
|
||||
... '''Root mean squared error'''
|
||||
... return np.sqrt(np.mean((res - expected)**2))
|
||||
>>>
|
||||
>>>
|
||||
>>> a, b = np.log10(5), np.log10(1000)
|
||||
>>> ns = np.round(np.logspace(a, b, 10)).astype(int)
|
||||
>>> reps = 1000 # number of repetitions for each sample size
|
||||
>>> expected = stats.expon.entropy()
|
||||
>>>
|
||||
>>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
|
||||
>>> for method in method_errors:
|
||||
... for n in ns:
|
||||
... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
|
||||
... res = stats.differential_entropy(rvs, method=method, axis=-1)
|
||||
... error = rmse(res, expected)
|
||||
... method_errors[method].append(error)
|
||||
>>>
|
||||
>>> for method, errors in method_errors.items():
|
||||
... plt.loglog(ns, errors, label=method)
|
||||
>>>
|
||||
>>> plt.legend()
|
||||
>>> plt.xlabel('sample size')
|
||||
>>> plt.ylabel('RMSE (1000 trials)')
|
||||
>>> plt.title('Entropy Estimator Error (Exponential Distribution)')
|
||||
|
||||
"""
|
||||
values = np.asarray(values)
|
||||
values = np.moveaxis(values, axis, -1)
|
||||
n = values.shape[-1] # number of observations
|
||||
|
||||
if window_length is None:
|
||||
window_length = math.floor(math.sqrt(n) + 0.5)
|
||||
|
||||
if not 2 <= 2 * window_length < n:
|
||||
raise ValueError(
|
||||
f"Window length ({window_length}) must be positive and less "
|
||||
f"than half the sample size ({n}).",
|
||||
)
|
||||
|
||||
if base is not None and base <= 0:
|
||||
raise ValueError("`base` must be a positive number or `None`.")
|
||||
|
||||
sorted_data = np.sort(values, axis=-1)
|
||||
|
||||
methods = {"vasicek": _vasicek_entropy,
|
||||
"van es": _van_es_entropy,
|
||||
"correa": _correa_entropy,
|
||||
"ebrahimi": _ebrahimi_entropy,
|
||||
"auto": _vasicek_entropy}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
message = f"`method` must be one of {set(methods)}"
|
||||
raise ValueError(message)
|
||||
|
||||
if method == "auto":
|
||||
if n <= 10:
|
||||
method = 'van es'
|
||||
elif n <= 1000:
|
||||
method = 'ebrahimi'
|
||||
else:
|
||||
method = 'vasicek'
|
||||
|
||||
res = methods[method](sorted_data, window_length)
|
||||
|
||||
if base is not None:
|
||||
res /= np.log(base)
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def _pad_along_last_axis(X, m):
|
||||
"""Pad the data for computing the rolling window difference."""
|
||||
# scales a bit better than method in _vasicek_like_entropy
|
||||
shape = np.array(X.shape)
|
||||
shape[-1] = m
|
||||
Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
|
||||
Xr = np.broadcast_to(X[..., [-1]], shape)
|
||||
return np.concatenate((Xl, X, Xr), axis=-1)
|
||||
|
||||
|
||||
def _vasicek_entropy(X, m):
|
||||
"""Compute the Vasicek estimator as described in [6] Eq. 1.3."""
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m)
|
||||
differences = X[..., 2 * m:] - X[..., : -2 * m:]
|
||||
logs = np.log(n/(2*m) * differences)
|
||||
return np.mean(logs, axis=-1)
|
||||
|
||||
|
||||
def _van_es_entropy(X, m):
|
||||
"""Compute the van Es estimator as described in [6]."""
|
||||
# No equation number, but referred to as HVE_mn.
|
||||
# Typo: there should be a log within the summation.
|
||||
n = X.shape[-1]
|
||||
difference = X[..., m:] - X[..., :-m]
|
||||
term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
|
||||
k = np.arange(m, n+1)
|
||||
return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
|
||||
|
||||
|
||||
def _ebrahimi_entropy(X, m):
|
||||
"""Compute the Ebrahimi estimator as described in [6]."""
|
||||
# No equation number, but referred to as HE_mn
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m)
|
||||
|
||||
differences = X[..., 2 * m:] - X[..., : -2 * m:]
|
||||
|
||||
i = np.arange(1, n+1).astype(float)
|
||||
ci = np.ones_like(i)*2
|
||||
ci[i <= m] = 1 + (i[i <= m] - 1)/m
|
||||
ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m
|
||||
|
||||
logs = np.log(n * differences / (ci * m))
|
||||
return np.mean(logs, axis=-1)
|
||||
|
||||
|
||||
def _correa_entropy(X, m):
|
||||
"""Compute the Correa estimator as described in [6]."""
|
||||
# No equation number, but referred to as HC_mn
|
||||
n = X.shape[-1]
|
||||
X = _pad_along_last_axis(X, m)
|
||||
|
||||
i = np.arange(1, n+1)
|
||||
dj = np.arange(-m, m+1)[:, None]
|
||||
j = i + dj
|
||||
j0 = j + m - 1 # 0-indexed version of j
|
||||
|
||||
Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
|
||||
difference = X[..., j0] - Xibar
|
||||
num = np.sum(difference*dj, axis=-2) # dj is d-i
|
||||
den = n*np.sum(difference**2, axis=-2)
|
||||
return -np.mean(np.log(num/den), axis=-1)
|
||||
1354
venv/lib/python3.12/site-packages/scipy/stats/_fit.py
Normal file
1354
venv/lib/python3.12/site-packages/scipy/stats/_fit.py
Normal file
File diff suppressed because it is too large
Load Diff
2027
venv/lib/python3.12/site-packages/scipy/stats/_hypotests.py
Normal file
2027
venv/lib/python3.12/site-packages/scipy/stats/_hypotests.py
Normal file
File diff suppressed because it is too large
Load Diff
725
venv/lib/python3.12/site-packages/scipy/stats/_kde.py
Normal file
725
venv/lib/python3.12/site-packages/scipy/stats/_kde.py
Normal file
@ -0,0 +1,725 @@
|
||||
#-------------------------------------------------------------------------------
|
||||
#
|
||||
# Define classes for (uni/multi)-variate kernel density estimation.
|
||||
#
|
||||
# Currently, only Gaussian kernels are implemented.
|
||||
#
|
||||
# Written by: Robert Kern
|
||||
#
|
||||
# Date: 2004-08-09
|
||||
#
|
||||
# Modified: 2005-02-10 by Robert Kern.
|
||||
# Contributed to SciPy
|
||||
# 2005-10-07 by Robert Kern.
|
||||
# Some fixes to match the new scipy_core
|
||||
#
|
||||
# Copyright 2004-2005 by Enthought, Inc.
|
||||
#
|
||||
#-------------------------------------------------------------------------------
|
||||
|
||||
# Standard library imports.
|
||||
import warnings
|
||||
|
||||
# SciPy imports.
|
||||
from scipy import linalg, special
|
||||
from scipy._lib._util import check_random_state
|
||||
|
||||
from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
|
||||
sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
|
||||
ones, cov)
|
||||
import numpy as np
|
||||
|
||||
# Local imports.
|
||||
from . import _mvn
|
||||
from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
|
||||
|
||||
|
||||
__all__ = ['gaussian_kde']
|
||||
|
||||
|
||||
class gaussian_kde:
|
||||
"""Representation of a kernel-density estimate using Gaussian kernels.
|
||||
|
||||
Kernel density estimation is a way to estimate the probability density
|
||||
function (PDF) of a random variable in a non-parametric way.
|
||||
`gaussian_kde` works for both uni-variate and multi-variate data. It
|
||||
includes automatic bandwidth determination. The estimation works best for
|
||||
a unimodal distribution; bimodal or multi-modal distributions tend to be
|
||||
oversmoothed.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
dataset : array_like
|
||||
Datapoints to estimate from. In case of univariate data this is a 1-D
|
||||
array, otherwise a 2-D array with shape (# of dims, # of data).
|
||||
bw_method : str, scalar or callable, optional
|
||||
The method used to calculate the estimator bandwidth. This can be
|
||||
'scott', 'silverman', a scalar constant or a callable. If a scalar,
|
||||
this will be used directly as `kde.factor`. If a callable, it should
|
||||
take a `gaussian_kde` instance as only parameter and return a scalar.
|
||||
If None (default), 'scott' is used. See Notes for more details.
|
||||
weights : array_like, optional
|
||||
weights of datapoints. This must be the same shape as dataset.
|
||||
If None (default), the samples are assumed to be equally weighted
|
||||
|
||||
Attributes
|
||||
----------
|
||||
dataset : ndarray
|
||||
The dataset with which `gaussian_kde` was initialized.
|
||||
d : int
|
||||
Number of dimensions.
|
||||
n : int
|
||||
Number of datapoints.
|
||||
neff : int
|
||||
Effective number of datapoints.
|
||||
|
||||
.. versionadded:: 1.2.0
|
||||
factor : float
|
||||
The bandwidth factor, obtained from `kde.covariance_factor`. The square
|
||||
of `kde.factor` multiplies the covariance matrix of the data in the kde
|
||||
estimation.
|
||||
covariance : ndarray
|
||||
The covariance matrix of `dataset`, scaled by the calculated bandwidth
|
||||
(`kde.factor`).
|
||||
inv_cov : ndarray
|
||||
The inverse of `covariance`.
|
||||
|
||||
Methods
|
||||
-------
|
||||
evaluate
|
||||
__call__
|
||||
integrate_gaussian
|
||||
integrate_box_1d
|
||||
integrate_box
|
||||
integrate_kde
|
||||
pdf
|
||||
logpdf
|
||||
resample
|
||||
set_bandwidth
|
||||
covariance_factor
|
||||
|
||||
Notes
|
||||
-----
|
||||
Bandwidth selection strongly influences the estimate obtained from the KDE
|
||||
(much more so than the actual shape of the kernel). Bandwidth selection
|
||||
can be done by a "rule of thumb", by cross-validation, by "plug-in
|
||||
methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
|
||||
uses a rule of thumb, the default is Scott's Rule.
|
||||
|
||||
Scott's Rule [1]_, implemented as `scotts_factor`, is::
|
||||
|
||||
n**(-1./(d+4)),
|
||||
|
||||
with ``n`` the number of data points and ``d`` the number of dimensions.
|
||||
In the case of unequally weighted points, `scotts_factor` becomes::
|
||||
|
||||
neff**(-1./(d+4)),
|
||||
|
||||
with ``neff`` the effective number of datapoints.
|
||||
Silverman's Rule [2]_, implemented as `silverman_factor`, is::
|
||||
|
||||
(n * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
or in the case of unequally weighted points::
|
||||
|
||||
(neff * (d + 2) / 4.)**(-1. / (d + 4)).
|
||||
|
||||
Good general descriptions of kernel density estimation can be found in [1]_
|
||||
and [2]_, the mathematics for this multi-dimensional implementation can be
|
||||
found in [1]_.
|
||||
|
||||
With a set of weighted samples, the effective number of datapoints ``neff``
|
||||
is defined by::
|
||||
|
||||
neff = sum(weights)^2 / sum(weights^2)
|
||||
|
||||
as detailed in [5]_.
|
||||
|
||||
`gaussian_kde` does not currently support data that lies in a
|
||||
lower-dimensional subspace of the space in which it is expressed. For such
|
||||
data, consider performing principal component analysis / dimensionality
|
||||
reduction and using `gaussian_kde` with the transformed data.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
|
||||
Visualization", John Wiley & Sons, New York, Chicester, 1992.
|
||||
.. [2] B.W. Silverman, "Density Estimation for Statistics and Data
|
||||
Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
|
||||
Chapman and Hall, London, 1986.
|
||||
.. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
|
||||
Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
|
||||
.. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
|
||||
conditional density estimation", Computational Statistics & Data
|
||||
Analysis, Vol. 36, pp. 279-298, 2001.
|
||||
.. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
|
||||
Series A (General), 132, 272
|
||||
|
||||
Examples
|
||||
--------
|
||||
Generate some random two-dimensional data:
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy import stats
|
||||
>>> def measure(n):
|
||||
... "Measurement model, return two coupled measurements."
|
||||
... m1 = np.random.normal(size=n)
|
||||
... m2 = np.random.normal(scale=0.5, size=n)
|
||||
... return m1+m2, m1-m2
|
||||
|
||||
>>> m1, m2 = measure(2000)
|
||||
>>> xmin = m1.min()
|
||||
>>> xmax = m1.max()
|
||||
>>> ymin = m2.min()
|
||||
>>> ymax = m2.max()
|
||||
|
||||
Perform a kernel density estimate on the data:
|
||||
|
||||
>>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||
>>> positions = np.vstack([X.ravel(), Y.ravel()])
|
||||
>>> values = np.vstack([m1, m2])
|
||||
>>> kernel = stats.gaussian_kde(values)
|
||||
>>> Z = np.reshape(kernel(positions).T, X.shape)
|
||||
|
||||
Plot the results:
|
||||
|
||||
>>> import matplotlib.pyplot as plt
|
||||
>>> fig, ax = plt.subplots()
|
||||
>>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
|
||||
... extent=[xmin, xmax, ymin, ymax])
|
||||
>>> ax.plot(m1, m2, 'k.', markersize=2)
|
||||
>>> ax.set_xlim([xmin, xmax])
|
||||
>>> ax.set_ylim([ymin, ymax])
|
||||
>>> plt.show()
|
||||
|
||||
"""
|
||||
def __init__(self, dataset, bw_method=None, weights=None):
|
||||
self.dataset = atleast_2d(asarray(dataset))
|
||||
if not self.dataset.size > 1:
|
||||
raise ValueError("`dataset` input should have multiple elements.")
|
||||
|
||||
self.d, self.n = self.dataset.shape
|
||||
|
||||
if weights is not None:
|
||||
self._weights = atleast_1d(weights).astype(float)
|
||||
self._weights /= sum(self._weights)
|
||||
if self.weights.ndim != 1:
|
||||
raise ValueError("`weights` input should be one-dimensional.")
|
||||
if len(self._weights) != self.n:
|
||||
raise ValueError("`weights` input should be of length n")
|
||||
self._neff = 1/sum(self._weights**2)
|
||||
|
||||
# This can be converted to a warning once gh-10205 is resolved
|
||||
if self.d > self.n:
|
||||
msg = ("Number of dimensions is greater than number of samples. "
|
||||
"This results in a singular data covariance matrix, which "
|
||||
"cannot be treated using the algorithms implemented in "
|
||||
"`gaussian_kde`. Note that `gaussian_kde` interprets each "
|
||||
"*column* of `dataset` to be a point; consider transposing "
|
||||
"the input to `dataset`.")
|
||||
raise ValueError(msg)
|
||||
|
||||
try:
|
||||
self.set_bandwidth(bw_method=bw_method)
|
||||
except linalg.LinAlgError as e:
|
||||
msg = ("The data appears to lie in a lower-dimensional subspace "
|
||||
"of the space in which it is expressed. This has resulted "
|
||||
"in a singular data covariance matrix, which cannot be "
|
||||
"treated using the algorithms implemented in "
|
||||
"`gaussian_kde`. Consider performing principle component "
|
||||
"analysis / dimensionality reduction and using "
|
||||
"`gaussian_kde` with the transformed data.")
|
||||
raise linalg.LinAlgError(msg) from e
|
||||
|
||||
def evaluate(self, points):
    """Evaluate the estimated pdf on a set of points.

    Parameters
    ----------
    points : (# of dimensions, # of points)-array
        Alternatively, a (# of dimensions,) vector can be passed in and
        treated as a single point.

    Returns
    -------
    values : (# of points,)-array
        The values at each point.

    Raises
    ------
    ValueError : if the dimensionality of the input points is different than
                 the dimensionality of the KDE.

    """
    points = atleast_2d(asarray(points))

    d, m = points.shape
    if d != self.d:
        if d != 1 or m != self.d:
            raise ValueError(f"points have dimension {d}, "
                             f"dataset has dimension {self.d}")
        # A single point was passed in as a row vector; make it a column.
        points = reshape(points, (self.d, 1))
        m = 1

    # Dispatch to the compiled estimator specialized for the output dtype.
    output_dtype, spec = _get_output_dtype(self.covariance, points)
    estimate = gaussian_kernel_estimate[spec](
        self.dataset.T, self.weights[:, None],
        points.T, self.cho_cov, output_dtype)

    return estimate[:, 0]

__call__ = evaluate
|
||||
|
||||
def integrate_gaussian(self, mean, cov):
    """
    Multiply estimated density by a multivariate Gaussian and integrate
    over the whole space.

    Parameters
    ----------
    mean : array_like
        A 1-D array, specifying the mean of the Gaussian.
    cov : array_like
        A 2-D array, specifying the covariance matrix of the Gaussian.

    Returns
    -------
    result : scalar
        The value of the integral.

    Raises
    ------
    ValueError
        If the mean or covariance of the input Gaussian differs from
        the KDE's dimensionality.

    """
    mean = atleast_1d(squeeze(mean))
    cov = atleast_2d(cov)

    if mean.shape != (self.d,):
        raise ValueError("mean does not have dimension %s" % self.d)
    if cov.shape != (self.d, self.d):
        raise ValueError("covariance does not have dimension %s" % self.d)

    # make mean a column vector
    mean = mean[:, newaxis]

    # The integral of the product of two Gaussians is itself a Gaussian
    # density evaluated with the *summed* covariance, so the problem
    # reduces to evaluating kernels with covariance `sum_cov` at `mean`.
    sum_cov = self.covariance + cov

    # This will raise LinAlgError if the new cov matrix is not s.p.d
    # cho_factor returns (ndarray, bool) where bool is a flag for whether
    # or not ndarray is upper or lower triangular
    sum_cov_chol = linalg.cho_factor(sum_cov)

    diff = self.dataset - mean
    tdiff = linalg.cho_solve(sum_cov_chol, diff)

    # det(sum_cov) == prod(diag(chol))**2, so this is sqrt of the det.
    sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
    norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det

    # Mahalanobis energies of each data point; weighted Gaussian sum.
    energies = sum(diff * tdiff, axis=0) / 2.0
    result = sum(exp(-energies)*self.weights, axis=0) / norm_const

    return result
|
||||
|
||||
def integrate_box_1d(self, low, high):
    """Integrate the 1-D estimated pdf between two bounds.

    Parameters
    ----------
    low : scalar
        Lower bound of integration.
    high : scalar
        Upper bound of integration.

    Returns
    -------
    value : scalar
        The probability mass between `low` and `high`.

    Raises
    ------
    ValueError
        If the KDE is over more than one dimension.
    """
    if self.d != 1:
        raise ValueError("integrate_box_1d() only handles 1D pdfs")

    # Kernel standard deviation (the covariance is 1x1 here).
    stdev = ravel(sqrt(self.covariance))[0]

    # Standardise both bounds against every kernel centre at once.
    lo = ravel((low - self.dataset) / stdev)
    hi = ravel((high - self.dataset) / stdev)

    # Weighted sum of per-kernel normal-CDF differences.
    return np.sum(self.weights * (special.ndtr(hi) - special.ndtr(lo)))
|
||||
|
||||
def integrate_box(self, low_bounds, high_bounds, maxpts=None):
    """Computes the integral of a pdf over a rectangular interval.

    Parameters
    ----------
    low_bounds : array_like
        A 1-D array containing the lower bounds of integration.
    high_bounds : array_like
        A 1-D array containing the upper bounds of integration.
    maxpts : int, optional
        The maximum number of points to use for integration.

    Returns
    -------
    value : scalar
        The result of the integral.

    """
    # Only forward `maxpts` when the caller supplied it, so the
    # Fortran routine's own default applies otherwise.
    if maxpts is not None:
        extra_kwds = {'maxpts': maxpts}
    else:
        extra_kwds = {}

    # Delegate the multivariate-normal box integral to the compiled
    # _mvn routine; `inform` is nonzero when it ran out of points.
    value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
                                        self.dataset, self.weights,
                                        self.covariance, **extra_kwds)
    if inform:
        # NOTE(review): `self.d * 1000` is presumably the routine's
        # default maxpts — confirm against _mvn.mvnun_weighted.
        msg = ('An integral in _mvn.mvnun requires more points than %s' %
               (self.d * 1000))
        warnings.warn(msg, stacklevel=2)

    return value
|
||||
|
||||
def integrate_kde(self, other):
    """
    Computes the integral of the product of this kernel density estimate
    with another.

    Parameters
    ----------
    other : gaussian_kde instance
        The other kde.

    Returns
    -------
    value : scalar
        The result of the integral.

    Raises
    ------
    ValueError
        If the KDEs have different dimensionality.

    """
    if other.d != self.d:
        raise ValueError("KDEs are not the same dimensionality")

    # we want to iterate over the smallest number of points
    if other.n < self.n:
        small = other
        large = self
    else:
        small = self
        large = other

    # Integral of a product of two Gaussians is a Gaussian with the
    # summed covariance, evaluated at the difference of the means.
    sum_cov = small.covariance + large.covariance
    sum_cov_chol = linalg.cho_factor(sum_cov)
    result = 0.0
    for i in range(small.n):
        mean = small.dataset[:, i, newaxis]
        diff = large.dataset - mean
        tdiff = linalg.cho_solve(sum_cov_chol, diff)

        # Weighted cross-sum over all kernel pairs involving point i.
        energies = sum(diff * tdiff, axis=0) / 2.0
        result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]

    # det(sum_cov) == prod(diag(chol))**2, so this is sqrt of the det.
    sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
    norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det

    result /= norm_const

    return result
|
||||
|
||||
def resample(self, size=None, seed=None):
    """Draw a random sample from the estimated pdf.

    Parameters
    ----------
    size : int, optional
        Number of samples to draw.  Defaults to the effective number
        of samples in the underlying dataset (``int(self.neff)``).
    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
        If None (or `np.random`), the global `numpy.random.RandomState`
        singleton is used; an int seeds a fresh ``RandomState``; an
        existing ``Generator``/``RandomState`` is used as-is.

    Returns
    -------
    resample : (self.d, `size`) ndarray
        The sampled dataset.
    """  # numpy/numpydoc#87  # noqa: E501
    if size is None:
        # Default to the effective sample size of the weighted dataset.
        size = int(self.neff)

    rng = check_random_state(seed)

    # Kernel-space noise: `size` draws from N(0, covariance).  Drawn
    # first so the random stream order matches for a given seed.
    noise = rng.multivariate_normal(
        zeros((self.d,), float), self.covariance, size=size
    )

    # Choose which kernel each sample comes from, weighted accordingly.
    picks = rng.choice(self.n, size=size, p=self.weights)
    centres = self.dataset[:, picks]

    return centres + transpose(noise)
|
||||
|
||||
def scotts_factor(self):
    """Compute Scott's rule-of-thumb bandwidth factor.

    Returns
    -------
    s : float
        Scott's factor, ``neff ** (-1 / (d + 4))``.
    """
    exponent = -1. / (self.d + 4)
    return power(self.neff, exponent)
|
||||
|
||||
def silverman_factor(self):
    """Compute the Silverman rule-of-thumb bandwidth factor.

    Returns
    -------
    s : float
        The Silverman factor, ``(neff * (d + 2) / 4) ** (-1 / (d + 4))``.
    """
    base = self.neff * (self.d + 2.0) / 4.0
    return power(base, -1. / (self.d + 4))
|
||||
|
||||
# Default method to calculate bandwidth, can be overwritten by subclass
covariance_factor = scotts_factor
# NOTE(review): `covariance_factor` aliases `scotts_factor`, so assigning
# `__doc__` below also replaces the docstring of `scotts_factor` itself.
covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
multiplies the data covariance matrix to obtain the kernel covariance
matrix. The default is `scotts_factor`. A subclass can overwrite this
method to provide a different method, or set it through a call to
`kde.set_bandwidth`."""
|
||||
|
||||
def set_bandwidth(self, bw_method=None):
    """Compute the estimator bandwidth with given method.

    The new bandwidth calculated after a call to `set_bandwidth` is used
    for subsequent evaluations of the estimated density.

    Parameters
    ----------
    bw_method : str, scalar or callable, optional
        The method used to calculate the estimator bandwidth.  This can be
        'scott', 'silverman', a scalar constant or a callable.  If a
        scalar, this will be used directly as `kde.factor`.  If a callable,
        it should take a `gaussian_kde` instance as only parameter and
        return a scalar.  If None (default), nothing happens; the current
        `kde.covariance_factor` method is kept.

    Notes
    -----
    .. versionadded:: 0.11

    Examples
    --------
    >>> import numpy as np
    >>> import scipy.stats as stats
    >>> x1 = np.array([-7, -5, 1, 4, 5.])
    >>> kde = stats.gaussian_kde(x1)
    >>> xs = np.linspace(-10, 10, num=50)
    >>> y1 = kde(xs)
    >>> kde.set_bandwidth(bw_method='silverman')
    >>> y2 = kde(xs)
    >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
    >>> y3 = kde(xs)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots()
    >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
    ...         label='Data points (rescaled)')
    >>> ax.plot(xs, y1, label='Scott (default)')
    >>> ax.plot(xs, y2, label='Silverman')
    >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
    >>> ax.legend()
    >>> plt.show()

    """
    if bw_method is None:
        # Keep whatever covariance_factor is currently bound.
        pass
    elif bw_method == 'scott':
        self.covariance_factor = self.scotts_factor
    elif bw_method == 'silverman':
        self.covariance_factor = self.silverman_factor
    elif np.isscalar(bw_method) and not isinstance(bw_method, str):
        # A bare number is used directly as the bandwidth factor.
        self._bw_method = 'use constant'
        self.covariance_factor = lambda: bw_method
    elif callable(bw_method):
        # The callable receives this kde and must return a scalar.
        self._bw_method = bw_method
        self.covariance_factor = lambda: self._bw_method(self)
    else:
        msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
              "or a callable."
        raise ValueError(msg)

    # Recompute the kernel covariance with the (possibly new) factor.
    self._compute_covariance()
|
||||
|
||||
def _compute_covariance(self):
    """Computes the covariance matrix for each Gaussian kernel using
    covariance_factor().
    """
    self.factor = self.covariance_factor()
    # Cache covariance and Cholesky decomp of covariance
    if not hasattr(self, '_data_cho_cov'):
        self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                               bias=False,
                                               aweights=self.weights))
        self._data_cho_cov = linalg.cholesky(self._data_covariance,
                                             lower=True)

    # Kernel covariance is the (cached) data covariance scaled by the
    # squared bandwidth factor; its Cholesky factor scales linearly.
    self.covariance = self._data_covariance * self.factor**2
    self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
    # log(det(2*pi*covariance)): det(cov) = prod(diag(cho_cov))**2, and
    # folding sqrt(2*pi) into each diagonal entry absorbs the (2*pi)**d.
    self.log_det = 2*np.log(np.diag(self.cho_cov
                                    * np.sqrt(2*pi))).sum()
|
||||
|
||||
@property
def inv_cov(self):
    # Inverse of the kernel covariance matrix.
    # Re-compute from scratch each time because I'm not sure how this is
    # used in the wild. (Perhaps users change the `dataset`, since it's
    # not a private attribute?) `_compute_covariance` used to recalculate
    # all these, so we'll recalculate everything now that this is a
    # a property.
    self.factor = self.covariance_factor()
    self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
                                           bias=False, aweights=self.weights))
    # inv(factor**2 * data_cov) == inv(data_cov) / factor**2.
    return linalg.inv(self._data_covariance) / self.factor**2
|
||||
|
||||
def pdf(self, x):
    """
    Evaluate the estimated pdf on a provided set of points.

    Notes
    -----
    This is an alias for `gaussian_kde.evaluate`.  See the ``evaluate``
    docstring for more details.

    """
    return self.evaluate(x)
|
||||
|
||||
def logpdf(self, x):
    """
    Evaluate the log of the estimated pdf on a provided set of points.

    Parameters
    ----------
    x : (# of dimensions, # of points)-array
        Alternatively, a (# of dimensions,) vector can be passed in and
        treated as a single point.

    Returns
    -------
    values : (# of points,)-array
        The log-density at each point.
    """
    points = atleast_2d(x)

    d, m = points.shape
    if d != self.d:
        if d == 1 and m == self.d:
            # points was passed in as a row vector
            points = reshape(points, (self.d, 1))
            m = 1
        else:
            msg = (f"points have dimension {d}, "
                   f"dataset has dimension {self.d}")
            raise ValueError(msg)

    # Same dispatch as `evaluate`, but through the log-space kernel,
    # which avoids underflow for points far from the data.
    output_dtype, spec = _get_output_dtype(self.covariance, points)
    result = gaussian_kernel_estimate_log[spec](
        self.dataset.T, self.weights[:, None],
        points.T, self.cho_cov, output_dtype)

    return result[:, 0]
|
||||
|
||||
def marginal(self, dimensions):
    """Return a marginal KDE distribution

    Parameters
    ----------
    dimensions : int or 1-d array_like
        The dimensions of the multivariate distribution corresponding
        with the marginal variables, that is, the indices of the dimensions
        that are being retained. The other dimensions are marginalized out.

    Returns
    -------
    marginal_kde : gaussian_kde
        An object representing the marginal distribution.

    Notes
    -----
    .. versionadded:: 1.10.0

    """

    dims = np.atleast_1d(dimensions)

    if not np.issubdtype(dims.dtype, np.integer):
        msg = ("Elements of `dimensions` must be integers - the indices "
               "of the marginal variables being retained.")
        raise ValueError(msg)

    n = len(self.dataset)  # number of dimensions
    original_dims = dims.copy()

    # Normalize negative indices in place (e.g. -1 -> n-1).
    dims[dims < 0] = n + dims[dims < 0]

    if len(np.unique(dims)) != len(dims):
        msg = ("All elements of `dimensions` must be unique.")
        raise ValueError(msg)

    i_invalid = (dims < 0) | (dims >= n)
    if np.any(i_invalid):
        # Report the indices exactly as the caller supplied them.
        msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
               f"for a distribution in {n} dimensions.")
        raise ValueError(msg)

    dataset = self.dataset[dims]
    weights = self.weights

    # Marginalizing a Gaussian mixture just drops the marginalized
    # coordinates, so re-fit on the retained rows with the same
    # bandwidth factor and weights.
    return gaussian_kde(dataset, bw_method=self.covariance_factor(),
                        weights=weights)
|
||||
|
||||
@property
def weights(self):
    """Per-point weights, lazily defaulting to uniform ``1/n``."""
    if not hasattr(self, '_weights'):
        self._weights = ones(self.n)/self.n
    return self._weights
|
||||
|
||||
@property
def neff(self):
    """Effective sample size ``1 / sum(weights**2)``, computed lazily."""
    if not hasattr(self, '_neff'):
        self._neff = 1/sum(self.weights**2)
    return self._neff
|
||||
|
||||
|
||||
def _get_output_dtype(covariance, points):
|
||||
"""
|
||||
Calculates the output dtype and the "spec" (=C type name).
|
||||
|
||||
This was necessary in order to deal with the fused types in the Cython
|
||||
routine `gaussian_kernel_estimate`. See gh-10824 for details.
|
||||
"""
|
||||
output_dtype = np.common_type(covariance, points)
|
||||
itemsize = np.dtype(output_dtype).itemsize
|
||||
if itemsize == 4:
|
||||
spec = 'float'
|
||||
elif itemsize == 8:
|
||||
spec = 'double'
|
||||
elif itemsize in (12, 16):
|
||||
spec = 'long double'
|
||||
else:
|
||||
raise ValueError(
|
||||
f"{output_dtype} has unexpected item size: {itemsize}"
|
||||
)
|
||||
|
||||
return output_dtype, spec
|
||||
600
venv/lib/python3.12/site-packages/scipy/stats/_ksstats.py
Normal file
600
venv/lib/python3.12/site-packages/scipy/stats/_ksstats.py
Normal file
@ -0,0 +1,600 @@
|
||||
# Compute the two-sided one-sample Kolmogorov-Smirnov Prob(Dn <= d) where:
|
||||
# D_n = sup_x{|F_n(x) - F(x)|},
|
||||
# F_n(x) is the empirical CDF for a sample of size n {x_i: i=1,...,n},
|
||||
# F(x) is the CDF of a probability distribution.
|
||||
#
|
||||
# Exact methods:
|
||||
# Prob(D_n >= d) can be computed via a matrix algorithm of Durbin[1]
|
||||
# or a recursion algorithm due to Pomeranz[2].
|
||||
# Marsaglia, Tsang & Wang[3] gave a computation-efficient way to perform
|
||||
# the Durbin algorithm.
|
||||
# D_n >= d <==> D_n+ >= d or D_n- >= d (the one-sided K-S statistics), hence
|
||||
# Prob(D_n >= d) = 2*Prob(D_n+ >= d) - Prob(D_n+ >= d and D_n- >= d).
|
||||
# For d > 0.5, the latter intersection probability is 0.
|
||||
#
|
||||
# Approximate methods:
|
||||
# For d close to 0.5, ignoring that intersection term may still give a
|
||||
# reasonable approximation.
|
||||
# Li-Chien[4] and Korolyuk[5] gave an asymptotic formula extending
|
||||
# Kolmogorov's initial asymptotic, suitable for large d. (See
|
||||
# scipy.special.kolmogorov for that asymptotic)
|
||||
# Pelz-Good[6] used the functional equation for Jacobi theta functions to
|
||||
# transform the Li-Chien/Korolyuk formula to produce a computational formula
# suitable for small d.
|
||||
#
|
||||
# Simard and L'Ecuyer[7] provided an algorithm to decide when to use each of
|
||||
# the above approaches and it is that which is used here.
|
||||
#
|
||||
# Other approaches:
|
||||
# Carvalho[8] optimizes Durbin's matrix algorithm for large values of d.
|
||||
# Moscovich and Nadler[9] use FFTs to compute the convolutions.
|
||||
|
||||
# References:
|
||||
# [1] Durbin J (1968).
|
||||
# "The Probability that the Sample Distribution Function Lies Between Two
|
||||
# Parallel Straight Lines."
|
||||
# Annals of Mathematical Statistics, 39, 398-411.
|
||||
# [2] Pomeranz J (1974).
|
||||
# "Exact Cumulative Distribution of the Kolmogorov-Smirnov Statistic for
|
||||
# Small Samples (Algorithm 487)."
|
||||
# Communications of the ACM, 17(12), 703-704.
|
||||
# [3] Marsaglia G, Tsang WW, Wang J (2003).
|
||||
# "Evaluating Kolmogorov's Distribution."
|
||||
# Journal of Statistical Software, 8(18), 1-4.
|
||||
# [4] LI-CHIEN, C. (1956).
|
||||
# "On the exact distribution of the statistics of A. N. Kolmogorov and
|
||||
# their asymptotic expansion."
|
||||
# Acta Matematica Sinica, 6, 55-81.
|
||||
# [5] KOROLYUK, V. S. (1960).
|
||||
# "Asymptotic analysis of the distribution of the maximum deviation in
|
||||
# the Bernoulli scheme."
|
||||
# Theor. Probability Appl., 4, 339-366.
|
||||
# [6] Pelz W, Good IJ (1976).
|
||||
# "Approximating the Lower Tail-areas of the Kolmogorov-Smirnov One-sample
|
||||
# Statistic."
|
||||
# Journal of the Royal Statistical Society, Series B, 38(2), 152-156.
|
||||
# [7] Simard, R., L'Ecuyer, P. (2011)
|
||||
# "Computing the Two-Sided Kolmogorov-Smirnov Distribution",
|
||||
# Journal of Statistical Software, Vol 39, 11, 1-18.
|
||||
# [8] Carvalho, Luis (2015)
|
||||
# "An Improved Evaluation of Kolmogorov's Distribution"
|
||||
# Journal of Statistical Software, Code Snippets; Vol 65(3), 1-8.
|
||||
# [9] Amit Moscovich, Boaz Nadler (2017)
|
||||
# "Fast calculation of boundary crossing probabilities for Poisson
|
||||
# processes",
|
||||
# Statistics & Probability Letters, Vol 123, 177-182.
|
||||
|
||||
|
||||
import numpy as np
|
||||
import scipy.special
|
||||
import scipy.special._ufuncs as scu
|
||||
from scipy._lib._finite_differences import _derivative
|
||||
|
||||
# Scaling constants: intermediate matrix powers and running products are
# kept in floating range by multiplying/dividing by 2**128 while tracking
# the accumulated exponent separately.
_E128 = 128
_EP128 = np.ldexp(np.longdouble(1), _E128)   # 2.0**128 as a long double
_EM128 = np.ldexp(np.longdouble(1), -_E128)  # 2.0**-128 as a long double

_SQRT2PI = np.sqrt(2 * np.pi)
_LOG_2PI = np.log(2 * np.pi)
# -708 is approximately log of the smallest positive normal double;
# exp() of anything smaller underflows to 0.
_MIN_LOG = -708
_SQRT3 = np.sqrt(3)
_PI_SQUARED = np.pi ** 2
_PI_FOUR = np.pi ** 4
_PI_SIX = np.pi ** 6

# [Lifted from _loggamma.pxd.] If B_m are the Bernoulli numbers,
# then Stirling coeffs are B_{2j}/(2j)/(2j-1) for j=8,...1.
_STIRLING_COEFFS = [-2.955065359477124183e-2, 6.4102564102564102564e-3,
                    -1.9175269175269175269e-3, 8.4175084175084175084e-4,
                    -5.952380952380952381e-4, 7.9365079365079365079e-4,
                    -2.7777777777777777778e-3, 8.3333333333333333333e-2]
|
||||
|
||||
|
||||
def _log_nfactorial_div_n_pow_n(n):
    """Return ``log(n! / n**n)`` via Stirling's approximation.

    The ``n*log(n)`` term is cancelled analytically up front to avoid
    subtractive cancellation:
    ``log(n)/2 - n + log(2*pi)/2 + sum_j B_{2j}/(2j)/(2j-1)/n**(2j-1)``
    """
    recip = 1.0 / n
    # polyval evaluates the correction series in powers of 1/n**2;
    # the leading factor of 1/n supplies the odd powers of 1/n.
    series = recip * np.polyval(_STIRLING_COEFFS, recip / n)
    return np.log(n) / 2 - n + _LOG_2PI / 2 + series
|
||||
|
||||
|
||||
def _clip_prob(p):
|
||||
"""clips a probability to range 0<=p<=1."""
|
||||
return np.clip(p, 0.0, 1.0)
|
||||
|
||||
|
||||
def _select_and_clip_prob(cdfprob, sfprob, cdf=True):
    """Pick the CDF value (or the SF value when ``cdf`` is False), clipped
    to the range 0<=p<=1."""
    return _clip_prob(np.where(cdf, cdfprob, sfprob))
|
||||
|
||||
|
||||
def _kolmogn_DMTW(n, d, cdf=True):
    r"""Computes the Kolmogorov CDF:  Pr(D_n <= d) using the MTW approach to
    the Durbin matrix algorithm.

    Durbin (1968); Marsaglia, Tsang, Wang (2003). [1], [3].

    Parameters
    ----------
    n : int
        Sample size.
    d : float
        Value of the two-sided KS statistic.
    cdf : bool, optional
        Return the CDF when True, the SF when False.

    Returns
    -------
    float
        Pr(D_n <= d) (or its complement), clipped to [0, 1].
    """
    # Write d = (k-h)/n, where k is positive integer and 0 <= h < 1
    # Generate initial matrix H of size m*m where m=(2k-1)
    # Compute k-th row of (n!/n^n) * H^n, scaling intermediate results.
    # Requires memory O(m^2) and computation O(m^2 log(n)).
    # Most suitable for small m.

    if d >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf)
    nd = n * d
    if nd <= 0.5:
        return _select_and_clip_prob(0.0, 1.0, cdf)
    k = int(np.ceil(nd))
    h = k - nd
    m = 2 * k - 1

    H = np.zeros([m, m])

    # Initialize: v is first column (and last row) of H
    #  v[j] = (1-h^(j+1)/(j+1)! (except for v[-1])
    #  w[j] = 1/(j)!
    # q = k-th row of H (actually i!/n^i*H^i)
    intm = np.arange(1, m + 1)
    v = 1.0 - h ** intm
    w = np.empty(m)
    fac = 1.0
    for j in intm:
        w[j - 1] = fac
        fac /= j  # This might underflow.  Isn't a problem.
        v[j - 1] *= fac
    tt = max(2 * h - 1.0, 0)**m - 2*h**m
    v[-1] = (1.0 + tt) * fac

    # H is (almost) lower Hessenberg: factorial reciprocals on and above
    # the subdiagonal, v down the first column, reversed v along the last
    # row.
    for i in range(1, m):
        H[i - 1:, i] = w[:m - i + 1]
    H[:, 0] = v
    H[-1, :] = np.flip(v, axis=0)

    # Compute H**n by binary (square-and-multiply) exponentiation,
    # rescaling by 2**128 whenever the pivot entry grows too large.
    Hpwr = np.eye(np.shape(H)[0])  # Holds intermediate powers of H
    nn = n
    expnt = 0  # Scaling of Hpwr
    Hexpnt = 0  # Scaling of H
    while nn > 0:
        if nn % 2:
            Hpwr = np.matmul(Hpwr, H)
            expnt += Hexpnt
        H = np.matmul(H, H)
        Hexpnt *= 2
        # Scale as needed.
        if np.abs(H[k - 1, k - 1]) > _EP128:
            H /= _EP128
            Hexpnt += _E128
        nn = nn // 2

    p = Hpwr[k - 1, k - 1]

    # Multiply by n!/n^n
    for i in range(1, n + 1):
        p = i * p / n
        if np.abs(p) < _EM128:
            p *= _EP128
            expnt -= _E128

    # unscale
    if expnt != 0:
        p = np.ldexp(p, expnt)

    return _select_and_clip_prob(p, 1.0-p, cdf)
|
||||
|
||||
|
||||
def _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf):
|
||||
"""Compute the endpoints of the interval for row i."""
|
||||
if i == 0:
|
||||
j1, j2 = -ll - ceilf - 1, ll + ceilf - 1
|
||||
else:
|
||||
# i + 1 = 2*ip1div2 + ip1mod2
|
||||
ip1div2, ip1mod2 = divmod(i + 1, 2)
|
||||
if ip1mod2 == 0: # i is odd
|
||||
if ip1div2 == n + 1:
|
||||
j1, j2 = n - ll - ceilf - 1, n + ll + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - roundf - 1, ip1div2 + ll - 1 + ceilf - 1
|
||||
else:
|
||||
j1, j2 = ip1div2 - 1 - ll - 1, ip1div2 + ll + roundf - 1
|
||||
|
||||
return max(j1 + 2, 0), min(j2, n)
|
||||
|
||||
|
||||
def _kolmogn_Pomeranz(n, x, cdf=True):
    r"""Computes Pr(D_n <= d) using the Pomeranz recursion algorithm.

    Pomeranz (1974) [2]

    Parameters
    ----------
    n : int
        Sample size.
    x : float
        Value of the two-sided KS statistic.
    cdf : bool, optional
        Return the CDF when True, the SF when False.

    Returns
    -------
    float
        Pr(D_n <= x) (or its complement), clipped to [0, 1].
    """

    # V is n*(2n+2) matrix.
    # Each row is convolution of the previous row and probabilities from a
    #  Poisson distribution.
    # Desired CDF probability is n! V[n-1, 2n+1] (final entry in final row).
    # Only two rows are needed at any given stage:
    #  - Call them V0 and V1.
    #  - Swap each iteration
    # Only a few (contiguous) entries in each row can be non-zero.
    #  - Keep track of start and end (j1 and j2 below)
    #  - V0s and V1s track the start in the two rows
    # Scale intermediate results as needed.
    # Only a few different Poisson distributions can occur
    t = n * x
    ll = int(np.floor(t))
    f = 1.0 * (t - ll)  # fractional part of t
    g = min(f, 1.0 - f)
    ceilf = (1 if f > 0 else 0)
    roundf = (1 if f > 0.5 else 0)
    npwrs = 2 * (ll + 1)    # Maximum number of powers needed in convolutions
    gpower = np.empty(npwrs)    # gpower = (g/n)^m/m!
    twogpower = np.empty(npwrs)   # twogpower = (2g/n)^m/m!
    onem2gpower = np.empty(npwrs)  # onem2gpower = ((1-2g)/n)^m/m!
    # gpower etc are *almost* Poisson probs, just missing normalizing factor.

    gpower[0] = 1.0
    twogpower[0] = 1.0
    onem2gpower[0] = 1.0
    expnt = 0
    g_over_n, two_g_over_n, one_minus_two_g_over_n = g/n, 2*g/n, (1 - 2*g)/n
    # Fill each power table with the recursion p[m] = p[m-1] * a / m.
    for m in range(1, npwrs):
        gpower[m] = gpower[m - 1] * g_over_n / m
        twogpower[m] = twogpower[m - 1] * two_g_over_n / m
        onem2gpower[m] = onem2gpower[m - 1] * one_minus_two_g_over_n / m

    V0 = np.zeros([npwrs])
    V1 = np.zeros([npwrs])
    V1[0] = 1  # first row
    V0s, V1s = 0, 0  # start indices of the two rows

    j1, j2 = _pomeranz_compute_j1j2(0, n, ll, ceilf, roundf)
    for i in range(1, 2 * n + 2):
        # Preserve j1, V1, V1s, V0s from last iteration
        k1 = j1
        V0, V1 = V1, V0
        V0s, V1s = V1s, V0s
        V1.fill(0.0)
        j1, j2 = _pomeranz_compute_j1j2(i, n, ll, ceilf, roundf)
        # The first and last rows use the g-table; otherwise odd/even rows
        # alternate between the 2g and (1-2g) tables.
        if i == 1 or i == 2 * n + 1:
            pwrs = gpower
        else:
            pwrs = (twogpower if i % 2 else onem2gpower)
        ln2 = j2 - k1 + 1
        if ln2 > 0:
            conv = np.convolve(V0[k1 - V0s:k1 - V0s + ln2], pwrs[:ln2])
            conv_start = j1 - k1  # First index to use from conv
            conv_len = j2 - j1 + 1  # Number of entries to use from conv
            V1[:conv_len] = conv[conv_start:conv_start + conv_len]
            # Scale to avoid underflow.
            if 0 < np.max(V1) < _EM128:
                V1 *= _EP128
                expnt -= _E128
            V1s = V0s + j1 - k1

    # multiply by n!
    ans = V1[n - V1s]
    for m in range(1, n + 1):
        if np.abs(ans) > _EP128:
            ans *= _EM128
            expnt += _E128
        ans *= m

    # Undo any intermediate scaling
    if expnt != 0:
        ans = np.ldexp(ans, expnt)
    ans = _select_and_clip_prob(ans, 1.0 - ans, cdf)
    return ans
|
||||
|
||||
|
||||
def _kolmogn_PelzGood(n, x, cdf=True):
    """Computes the Pelz-Good approximation to Prob(Dn <= x) with 0<=x<=1.

    Start with Li-Chien, Korolyuk approximation:
        Prob(Dn <= x) ~ K0(z) + K1(z)/sqrt(n) + K2(z)/n + K3(z)/n**1.5
    where z = x*sqrt(n).
    Transform each K_(z) using Jacobi theta functions into a form suitable
    for small z.
    Pelz-Good (1976). [6]

    Parameters
    ----------
    n : int
        Sample size.
    x : float
        Value of the two-sided KS statistic.
    cdf : bool, optional
        Return the CDF when True, the SF when False.

    Returns
    -------
    float
        The approximate probability.
    """
    if x <= 0.0:
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
    if x >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)

    z = np.sqrt(n) * x
    zsquared, zthree, zfour, zsix = z**2, z**3, z**4, z**6

    qlog = -_PI_SQUARED / 8 / zsquared
    if qlog < _MIN_LOG:  # z ~ 0.041743441416853426
        # q would underflow to zero: the CDF is indistinguishable from 0.
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)

    q = np.exp(qlog)

    # Coefficients of terms in the sums for K1, K2 and K3
    k1a = -zsquared
    k1b = _PI_SQUARED / 4

    k2a = 6 * zsix + 2 * zfour
    k2b = (2 * zfour - 5 * zsquared) * _PI_SQUARED / 4
    k2c = _PI_FOUR * (1 - 2 * zsquared) / 16

    k3d = _PI_SIX * (5 - 30 * zsquared) / 64
    k3c = _PI_FOUR * (-60 * zsquared + 212 * zfour) / 16
    k3b = _PI_SQUARED * (135 * zfour - 96 * zsix) / 4
    k3a = -30 * zsix - 90 * z**8

    K0to3 = np.zeros(4)
    # Use a Horner scheme to evaluate sum c_i q^(i^2)
    # Reduces to a sum over odd integers.
    maxk = int(np.ceil(16 * z / np.pi))
    for k in range(maxk, 0, -1):
        m = 2 * k - 1
        msquared, mfour, msix = m**2, m**4, m**6
        qpower = np.power(q, 8 * k)
        coeffs = np.array([1.0,
                           k1a + k1b*msquared,
                           k2a + k2b*msquared + k2c*mfour,
                           k3a + k3b*msquared + k3c*mfour + k3d*msix])
        K0to3 *= qpower
        K0to3 += coeffs
    K0to3 *= q
    K0to3 *= _SQRT2PI
    # z**10 > 0 as z > 0.04
    K0to3 /= np.array([z, 6 * zfour, 72 * z**7, 6480 * z**10])

    # Now do the other sum over the other terms, all integers k
    # K_2:  (pi^2 k^2) q^(k^2),
    # K_3:  (3pi^2 k^2 z^2 - pi^4 k^4)*q^(k^2)
    # Don't expect much subtractive cancellation so use direct calculation
    q = np.exp(-_PI_SQUARED / 2 / zsquared)
    ks = np.arange(maxk, 0, -1)
    ksquared = ks ** 2
    sqrt3z = _SQRT3 * z
    kspi = np.pi * ks
    qpwers = q ** ksquared
    k2extra = np.sum(ksquared * qpwers)
    k2extra *= _PI_SQUARED * _SQRT2PI/(-36 * zthree)
    K0to3[2] += k2extra
    k3extra = np.sum((sqrt3z + kspi) * (sqrt3z - kspi) * ksquared * qpwers)
    k3extra *= _PI_SQUARED * _SQRT2PI/(216 * zsix)
    K0to3[3] += k3extra
    # Apply the 1/n**(j/2) factor to each K_j term.
    powers_of_n = np.power(n * 1.0, np.arange(len(K0to3)) / 2.0)
    K0to3 /= powers_of_n

    if not cdf:
        # SF = 1 - CDF: negate every term and shift the leading one.
        K0to3 *= -1
        K0to3[0] += 1

    Ksum = sum(K0to3)
    return Ksum
|
||||
|
||||
|
||||
def _kolmogn(n, x, cdf=True):
    """Computes the CDF(or SF) for the two-sided Kolmogorov-Smirnov statistic.

    x must be of type float, n of type integer.

    Simard & L'Ecuyer (2011) [7].

    Routes to the exact Durbin/Pomeranz algorithms, the Ruben-Gambino
    closed forms, 2*smirnov, or the Pelz-Good approximation, depending on
    (n, x); the threshold constants follow [7].
    """
    if np.isnan(n):
        return n  # Keep the same type of nan
    if int(n) != n or n <= 0:
        return np.nan
    if x >= 1.0:
        return _select_and_clip_prob(1.0, 0.0, cdf=cdf)
    if x <= 0.0:
        return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
    t = n * x
    if t <= 1.0:  # Ruben-Gambino: 1/2n <= x <= 1/n
        if t <= 0.5:
            return _select_and_clip_prob(0.0, 1.0, cdf=cdf)
        # CDF = n!/n^n * (2t-1)^n; evaluate directly for small n,
        # otherwise in log space to avoid overflow/underflow.
        if n <= 140:
            prob = np.prod(np.arange(1, n+1) * (1.0/n) * (2*t - 1))
        else:
            prob = np.exp(_log_nfactorial_div_n_pow_n(n) + n * np.log(2*t-1))
        return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
    if t >= n - 1:  # Ruben-Gambino
        prob = 2 * (1.0 - x)**n
        return _select_and_clip_prob(1 - prob, prob, cdf=cdf)
    if x >= 0.5:  # Exact: 2 * smirnov
        # For x >= 0.5 the one-sided tails cannot both be exceeded.
        prob = 2 * scipy.special.smirnov(n, x)
        return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)

    nxsquared = t * x
    if n <= 140:
        if nxsquared <= 0.754693:
            prob = _kolmogn_DMTW(n, x, cdf=True)
            return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
        if nxsquared <= 4:
            prob = _kolmogn_Pomeranz(n, x, cdf=True)
            return _select_and_clip_prob(prob, 1.0 - prob, cdf=cdf)
        # Now use Miller approximation of 2 * smirnov
        prob = 2 * scipy.special.smirnov(n, x)
        return _select_and_clip_prob(1.0 - prob, prob, cdf=cdf)

    # Split CDF and SF as they have different cutoffs on nxsquared.
    if not cdf:
        if nxsquared >= 370.0:
            return 0.0
        if nxsquared >= 2.2:
            prob = 2 * scipy.special.smirnov(n, x)
            return _clip_prob(prob)
        # Fall through and compute the SF as 1.0-CDF
    if nxsquared >= 18.0:
        cdfprob = 1.0
    elif n <= 100000 and n * x**1.5 <= 1.4:
        cdfprob = _kolmogn_DMTW(n, x, cdf=True)
    else:
        cdfprob = _kolmogn_PelzGood(n, x, cdf=True)
    return _select_and_clip_prob(cdfprob, 1.0 - cdfprob, cdf=cdf)
|
||||
|
||||
|
||||
def _kolmogn_p(n, x):
    """Computes the PDF for the two-sided Kolmogorov-Smirnov statistic.

    x must be of type float, n of type integer.

    Uses the closed-form derivatives of the Ruben-Gambino regions and of
    2*smirnov, falling back to numerical differentiation of the CDF.
    """
    if np.isnan(n):
        return n  # Keep the same type of nan
    if int(n) != n or n <= 0:
        return np.nan
    if x >= 1.0 or x <= 0:
        return 0
    t = n * x
    if t <= 1.0:
        # Ruben-Gambino: n!/n^n * (2t-1)^n -> 2 n!/n^n * n^2 * (2t-1)^(n-1)
        if t <= 0.5:
            return 0.0
        if n <= 140:
            prd = np.prod(np.arange(1, n) * (1.0 / n) * (2 * t - 1))
        else:
            prd = np.exp(_log_nfactorial_div_n_pow_n(n) + (n-1) * np.log(2 * t - 1))
        return prd * 2 * n**2
    if t >= n - 1:
        # Ruben-Gambino : 1-2(1-x)**n -> 2n*(1-x)**(n-1)
        return 2 * (1.0 - x) ** (n-1) * n
    if x >= 0.5:
        # NOTE(review): relies on the `scipy.stats` attribute being
        # available via the parent package at call time — confirm.
        return 2 * scipy.stats.ksone.pdf(x, n)

    # Just take a small delta.
    # Ideally x +/- delta would stay within [i/n, (i+1)/n] for some integer a.
    # as the CDF is a piecewise degree n polynomial.
    # It has knots at 1/n, 2/n, ... (n-1)/n
    # and is not a C-infinity function at the knots
    delta = x / 2.0**16
    # Keep the stencil inside (1/n, 0.5), where the branch above applies.
    delta = min(delta, x - 1.0/n)
    delta = min(delta, 0.5 - x)

    def _kk(_x):
        return kolmogn(n, _x)

    return _derivative(_kk, x, dx=delta, order=5)
|
||||
|
||||
|
||||
def _kolmogni(n, p, q):
|
||||
"""Computes the PPF/ISF of kolmogn.
|
||||
|
||||
n of type integer, n>= 1
|
||||
p is the CDF, q the SF, p+q=1
|
||||
"""
|
||||
if np.isnan(n):
|
||||
return n # Keep the same type of nan
|
||||
if int(n) != n or n <= 0:
|
||||
return np.nan
|
||||
if p <= 0:
|
||||
return 1.0/n
|
||||
if q <= 0:
|
||||
return 1.0
|
||||
delta = np.exp((np.log(p) - scipy.special.loggamma(n+1))/n)
|
||||
if delta <= 1.0/n:
|
||||
return (delta + 1.0 / n) / 2
|
||||
x = -np.expm1(np.log(q/2.0)/n)
|
||||
if x >= 1 - 1.0/n:
|
||||
return x
|
||||
x1 = scu._kolmogci(p)/np.sqrt(n)
|
||||
x1 = min(x1, 1.0 - 1.0/n)
|
||||
|
||||
def _f(x):
|
||||
return _kolmogn(n, x) - p
|
||||
|
||||
return scipy.optimize.brentq(_f, 1.0/n, x1, xtol=1e-14)
|
||||
|
||||
|
||||
def kolmogn(n, x, cdf=True):
|
||||
"""Computes the CDF for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
The two-sided Kolmogorov-Smirnov distribution has as its CDF Pr(D_n <= x),
|
||||
for a sample of size n drawn from a distribution with CDF F(t), where
|
||||
:math:`D_n &= sup_t |F_n(t) - F(t)|`, and
|
||||
:math:`F_n(t)` is the Empirical Cumulative Distribution Function of the sample.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
x : float, array_like
|
||||
The K-S statistic, float between 0 and 1
|
||||
cdf : bool, optional
|
||||
whether to compute the CDF(default=true) or the SF.
|
||||
|
||||
Returns
|
||||
-------
|
||||
cdf : ndarray
|
||||
CDF (or SF it cdf is False) at the specified locations.
|
||||
|
||||
The return value has shape the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, x, cdf, None],
|
||||
op_dtypes=[None, np.float64, np.bool_, np.float64])
|
||||
for _n, _x, _cdf, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
z[...] = _kolmogn(int(_n), _x, cdf=_cdf)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
|
||||
|
||||
def kolmognp(n, x):
|
||||
"""Computes the PDF for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
x : float, array_like
|
||||
The K-S statistic, float between 0 and 1
|
||||
|
||||
Returns
|
||||
-------
|
||||
pdf : ndarray
|
||||
The PDF at the specified locations
|
||||
|
||||
The return value has shape the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, x, None])
|
||||
for _n, _x, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
z[...] = _kolmogn_p(int(_n), _x)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
|
||||
|
||||
def kolmogni(n, q, cdf=True):
|
||||
"""Computes the PPF(or ISF) for the two-sided Kolmogorov-Smirnov distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : integer, array_like
|
||||
the number of samples
|
||||
q : float, array_like
|
||||
Probabilities, float between 0 and 1
|
||||
cdf : bool, optional
|
||||
whether to compute the PPF(default=true) or the ISF.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ppf : ndarray
|
||||
PPF (or ISF if cdf is False) at the specified locations
|
||||
|
||||
The return value has shape the result of numpy broadcasting n and x.
|
||||
"""
|
||||
it = np.nditer([n, q, cdf, None])
|
||||
for _n, _q, _cdf, z in it:
|
||||
if np.isnan(_n):
|
||||
z[...] = _n
|
||||
continue
|
||||
if int(_n) != _n:
|
||||
raise ValueError(f'n is not integral: {_n}')
|
||||
_pcdf, _psf = (_q, 1-_q) if _cdf else (1-_q, _q)
|
||||
z[...] = _kolmogni(int(_n), _pcdf, _psf)
|
||||
result = it.operands[-1]
|
||||
return result
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
494
venv/lib/python3.12/site-packages/scipy/stats/_mannwhitneyu.py
Normal file
494
venv/lib/python3.12/site-packages/scipy/stats/_mannwhitneyu.py
Normal file
@ -0,0 +1,494 @@
|
||||
import numpy as np
|
||||
from collections import namedtuple
|
||||
from scipy import special
|
||||
from scipy import stats
|
||||
from scipy.stats._stats_py import _rankdata
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory
|
||||
|
||||
|
||||
def _broadcast_concatenate(x, y, axis):
|
||||
'''Broadcast then concatenate arrays, leaving concatenation axis last'''
|
||||
x = np.moveaxis(x, axis, -1)
|
||||
y = np.moveaxis(y, axis, -1)
|
||||
z = np.broadcast(x[..., 0], y[..., 0])
|
||||
x = np.broadcast_to(x, z.shape + (x.shape[-1],))
|
||||
y = np.broadcast_to(y, z.shape + (y.shape[-1],))
|
||||
z = np.concatenate((x, y), axis=-1)
|
||||
return x, y, z
|
||||
|
||||
|
||||
class _MWU:
|
||||
'''Distribution of MWU statistic under the null hypothesis'''
|
||||
|
||||
def __init__(self, n1, n2):
|
||||
self._reset(n1, n2)
|
||||
|
||||
def set_shapes(self, n1, n2):
|
||||
n1, n2 = min(n1, n2), max(n1, n2)
|
||||
if (n1, n2) == (self.n1, self.n2):
|
||||
return
|
||||
|
||||
self.n1 = n1
|
||||
self.n2 = n2
|
||||
self.s_array = np.zeros(0, dtype=int)
|
||||
self.configurations = np.zeros(0, dtype=np.uint64)
|
||||
|
||||
def reset(self):
|
||||
self._reset(self.n1, self.n2)
|
||||
|
||||
def _reset(self, n1, n2):
|
||||
self.n1 = None
|
||||
self.n2 = None
|
||||
self.set_shapes(n1, n2)
|
||||
|
||||
def pmf(self, k):
|
||||
|
||||
# In practice, `pmf` is never called with k > m*n/2.
|
||||
# If it were, we'd exploit symmetry here:
|
||||
# k = np.array(k, copy=True)
|
||||
# k2 = m*n - k
|
||||
# i = k2 < k
|
||||
# k[i] = k2[i]
|
||||
|
||||
pmfs = self.build_u_freqs_array(np.max(k))
|
||||
return pmfs[k]
|
||||
|
||||
def cdf(self, k):
|
||||
'''Cumulative distribution function'''
|
||||
|
||||
# In practice, `cdf` is never called with k > m*n/2.
|
||||
# If it were, we'd exploit symmetry here rather than in `sf`
|
||||
pmfs = self.build_u_freqs_array(np.max(k))
|
||||
cdfs = np.cumsum(pmfs)
|
||||
return cdfs[k]
|
||||
|
||||
def sf(self, k):
|
||||
'''Survival function'''
|
||||
# Note that both CDF and SF include the PMF at k. The p-value is
|
||||
# calculated from the SF and should include the mass at k, so this
|
||||
# is desirable
|
||||
|
||||
# Use the fact that the distribution is symmetric and sum from the left
|
||||
kc = np.asarray(self.n1*self.n2 - k) # complement of k
|
||||
i = k < kc
|
||||
if np.any(i):
|
||||
kc[i] = k[i]
|
||||
cdfs = np.asarray(self.cdf(kc))
|
||||
cdfs[i] = 1. - cdfs[i] + self.pmf(kc[i])
|
||||
else:
|
||||
cdfs = np.asarray(self.cdf(kc))
|
||||
return cdfs[()]
|
||||
|
||||
# build_sigma_array and build_u_freqs_array adapted from code
|
||||
# by @toobaz with permission. Thanks to @andreasloe for the suggestion.
|
||||
# See https://github.com/scipy/scipy/pull/4933#issuecomment-1898082691
|
||||
def build_sigma_array(self, a):
|
||||
n1, n2 = self.n1, self.n2
|
||||
if a + 1 <= self.s_array.size:
|
||||
return self.s_array[1:a+1]
|
||||
|
||||
s_array = np.zeros(a + 1, dtype=int)
|
||||
|
||||
for d in np.arange(1, n1 + 1):
|
||||
# All multiples of d, except 0:
|
||||
indices = np.arange(d, a + 1, d)
|
||||
# \epsilon_d = 1:
|
||||
s_array[indices] += d
|
||||
|
||||
for d in np.arange(n2 + 1, n2 + n1 + 1):
|
||||
# All multiples of d, except 0:
|
||||
indices = np.arange(d, a + 1, d)
|
||||
# \epsilon_d = -1:
|
||||
s_array[indices] -= d
|
||||
|
||||
# We don't need 0:
|
||||
self.s_array = s_array
|
||||
return s_array[1:]
|
||||
|
||||
def build_u_freqs_array(self, maxu):
|
||||
"""
|
||||
Build all the array of frequencies for u from 0 to maxu.
|
||||
Assumptions:
|
||||
n1 <= n2
|
||||
maxu <= n1 * n2 / 2
|
||||
"""
|
||||
n1, n2 = self.n1, self.n2
|
||||
total = special.binom(n1 + n2, n1)
|
||||
|
||||
if maxu + 1 <= self.configurations.size:
|
||||
return self.configurations[:maxu + 1] / total
|
||||
|
||||
s_array = self.build_sigma_array(maxu)
|
||||
|
||||
# Start working with ints, for maximum precision and efficiency:
|
||||
configurations = np.zeros(maxu + 1, dtype=np.uint64)
|
||||
configurations_is_uint = True
|
||||
uint_max = np.iinfo(np.uint64).max
|
||||
# How many ways to have U=0? 1
|
||||
configurations[0] = 1
|
||||
|
||||
for u in np.arange(1, maxu + 1):
|
||||
coeffs = s_array[u - 1::-1]
|
||||
new_val = np.dot(configurations[:u], coeffs) / u
|
||||
if new_val > uint_max and configurations_is_uint:
|
||||
# OK, we got into numbers too big for uint64.
|
||||
# So now we start working with floats.
|
||||
# By doing this since the beginning, we would have lost precision.
|
||||
# (And working on python long ints would be unbearably slow)
|
||||
configurations = configurations.astype(float)
|
||||
configurations_is_uint = False
|
||||
configurations[u] = new_val
|
||||
|
||||
self.configurations = configurations
|
||||
return configurations / total
|
||||
|
||||
|
||||
_mwu_state = _MWU(0, 0)
|
||||
|
||||
|
||||
def _get_mwu_z(U, n1, n2, t, axis=0, continuity=True):
|
||||
'''Standardized MWU statistic'''
|
||||
# Follows mannwhitneyu [2]
|
||||
mu = n1 * n2 / 2
|
||||
n = n1 + n2
|
||||
|
||||
# Tie correction according to [2], "Normal approximation and tie correction"
|
||||
# "A more computationally-efficient form..."
|
||||
tie_term = (t**3 - t).sum(axis=-1)
|
||||
s = np.sqrt(n1*n2/12 * ((n + 1) - tie_term/(n*(n-1))))
|
||||
|
||||
numerator = U - mu
|
||||
|
||||
# Continuity correction.
|
||||
# Because SF is always used to calculate the p-value, we can always
|
||||
# _subtract_ 0.5 for the continuity correction. This always increases the
|
||||
# p-value to account for the rest of the probability mass _at_ q = U.
|
||||
if continuity:
|
||||
numerator -= 0.5
|
||||
|
||||
# no problem evaluating the norm SF at an infinity
|
||||
with np.errstate(divide='ignore', invalid='ignore'):
|
||||
z = numerator / s
|
||||
return z
|
||||
|
||||
|
||||
def _mwu_input_validation(x, y, use_continuity, alternative, axis, method):
|
||||
''' Input validation and standardization for mannwhitneyu '''
|
||||
# Would use np.asarray_chkfinite, but infs are OK
|
||||
x, y = np.atleast_1d(x), np.atleast_1d(y)
|
||||
if np.isnan(x).any() or np.isnan(y).any():
|
||||
raise ValueError('`x` and `y` must not contain NaNs.')
|
||||
if np.size(x) == 0 or np.size(y) == 0:
|
||||
raise ValueError('`x` and `y` must be of nonzero size.')
|
||||
|
||||
bools = {True, False}
|
||||
if use_continuity not in bools:
|
||||
raise ValueError(f'`use_continuity` must be one of {bools}.')
|
||||
|
||||
alternatives = {"two-sided", "less", "greater"}
|
||||
alternative = alternative.lower()
|
||||
if alternative not in alternatives:
|
||||
raise ValueError(f'`alternative` must be one of {alternatives}.')
|
||||
|
||||
axis_int = int(axis)
|
||||
if axis != axis_int:
|
||||
raise ValueError('`axis` must be an integer.')
|
||||
|
||||
if not isinstance(method, stats.PermutationMethod):
|
||||
methods = {"asymptotic", "exact", "auto"}
|
||||
method = method.lower()
|
||||
if method not in methods:
|
||||
raise ValueError(f'`method` must be one of {methods}.')
|
||||
|
||||
return x, y, use_continuity, alternative, axis_int, method
|
||||
|
||||
|
||||
def _mwu_choose_method(n1, n2, ties):
|
||||
"""Choose method 'asymptotic' or 'exact' depending on input size, ties"""
|
||||
|
||||
# if both inputs are large, asymptotic is OK
|
||||
if n1 > 8 and n2 > 8:
|
||||
return "asymptotic"
|
||||
|
||||
# if there are any ties, asymptotic is preferred
|
||||
if ties:
|
||||
return "asymptotic"
|
||||
|
||||
return "exact"
|
||||
|
||||
|
||||
MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic', 'pvalue'))
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(MannwhitneyuResult, n_samples=2)
|
||||
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
|
||||
axis=0, method="auto"):
|
||||
r'''Perform the Mann-Whitney U rank test on two independent samples.
|
||||
|
||||
The Mann-Whitney U test is a nonparametric test of the null hypothesis
|
||||
that the distribution underlying sample `x` is the same as the
|
||||
distribution underlying sample `y`. It is often used as a test of
|
||||
difference in location between distributions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : array-like
|
||||
N-d arrays of samples. The arrays must be broadcastable except along
|
||||
the dimension given by `axis`.
|
||||
use_continuity : bool, optional
|
||||
Whether a continuity correction (1/2) should be applied.
|
||||
Default is True when `method` is ``'asymptotic'``; has no effect
|
||||
otherwise.
|
||||
alternative : {'two-sided', 'less', 'greater'}, optional
|
||||
Defines the alternative hypothesis. Default is 'two-sided'.
|
||||
Let *F(u)* and *G(u)* be the cumulative distribution functions of the
|
||||
distributions underlying `x` and `y`, respectively. Then the following
|
||||
alternative hypotheses are available:
|
||||
|
||||
* 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)* for
|
||||
at least one *u*.
|
||||
* 'less': the distribution underlying `x` is stochastically less
|
||||
than the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
|
||||
* 'greater': the distribution underlying `x` is stochastically greater
|
||||
than the distribution underlying `y`, i.e. *F(u) < G(u)* for all *u*.
|
||||
|
||||
Note that the mathematical expressions in the alternative hypotheses
|
||||
above describe the CDFs of the underlying distributions. The directions
|
||||
of the inequalities appear inconsistent with the natural language
|
||||
description at first glance, but they are not. For example, suppose
|
||||
*X* and *Y* are random variables that follow distributions with CDFs
|
||||
*F* and *G*, respectively. If *F(u) > G(u)* for all *u*, samples drawn
|
||||
from *X* tend to be less than those drawn from *Y*.
|
||||
|
||||
Under a more restrictive set of assumptions, the alternative hypotheses
|
||||
can be expressed in terms of the locations of the distributions;
|
||||
see [5] section 5.1.
|
||||
axis : int, optional
|
||||
Axis along which to perform the test. Default is 0.
|
||||
method : {'auto', 'asymptotic', 'exact'} or `PermutationMethod` instance, optional
|
||||
Selects the method used to calculate the *p*-value.
|
||||
Default is 'auto'. The following options are available.
|
||||
|
||||
* ``'asymptotic'``: compares the standardized test statistic
|
||||
against the normal distribution, correcting for ties.
|
||||
* ``'exact'``: computes the exact *p*-value by comparing the observed
|
||||
:math:`U` statistic against the exact distribution of the :math:`U`
|
||||
statistic under the null hypothesis. No correction is made for ties.
|
||||
* ``'auto'``: chooses ``'exact'`` when the size of one of the samples
|
||||
is less than or equal to 8 and there are no ties;
|
||||
chooses ``'asymptotic'`` otherwise.
|
||||
* `PermutationMethod` instance. In this case, the p-value
|
||||
is computed using `permutation_test` with the provided
|
||||
configuration options and other appropriate settings.
|
||||
|
||||
Returns
|
||||
-------
|
||||
res : MannwhitneyuResult
|
||||
An object containing attributes:
|
||||
|
||||
statistic : float
|
||||
The Mann-Whitney U statistic corresponding with sample `x`. See
|
||||
Notes for the test statistic corresponding with sample `y`.
|
||||
pvalue : float
|
||||
The associated *p*-value for the chosen `alternative`.
|
||||
|
||||
Notes
|
||||
-----
|
||||
If ``U1`` is the statistic corresponding with sample `x`, then the
|
||||
statistic corresponding with sample `y` is
|
||||
``U2 = x.shape[axis] * y.shape[axis] - U1``.
|
||||
|
||||
`mannwhitneyu` is for independent samples. For related / paired samples,
|
||||
consider `scipy.stats.wilcoxon`.
|
||||
|
||||
`method` ``'exact'`` is recommended when there are no ties and when either
|
||||
sample size is less than 8 [1]_. The implementation follows the algorithm
|
||||
reported in [3]_.
|
||||
Note that the exact method is *not* corrected for ties, but
|
||||
`mannwhitneyu` will not raise errors or warnings if there are ties in the
|
||||
data. If there are ties and either samples is small (fewer than ~10
|
||||
observations), consider passing an instance of `PermutationMethod`
|
||||
as the `method` to perform a permutation test.
|
||||
|
||||
The Mann-Whitney U test is a non-parametric version of the t-test for
|
||||
independent samples. When the means of samples from the populations
|
||||
are normally distributed, consider `scipy.stats.ttest_ind`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two random
|
||||
variables is stochastically larger than the other", The Annals of
|
||||
Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
|
||||
.. [2] Mann-Whitney U Test, Wikipedia,
|
||||
http://en.wikipedia.org/wiki/Mann-Whitney_U_test
|
||||
.. [3] Andreas Löffler,
|
||||
"Über eine Partition der nat. Zahlen und ihr Anwendung beim U-Test",
|
||||
Wiss. Z. Univ. Halle, XXXII'83 pp. 87-89.
|
||||
.. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test", Mathematics
|
||||
Learning Support Centre, 2004.
|
||||
.. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
|
||||
or t-test? On assumptions for hypothesis tests and multiple \
|
||||
interpretations of decision rules." Statistics surveys, Vol. 4, pp.
|
||||
1-39, 2010. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/
|
||||
|
||||
Examples
|
||||
--------
|
||||
We follow the example from [4]_: nine randomly sampled young adults were
|
||||
diagnosed with type II diabetes at the ages below.
|
||||
|
||||
>>> males = [19, 22, 16, 29, 24]
|
||||
>>> females = [20, 11, 17, 12]
|
||||
|
||||
We use the Mann-Whitney U test to assess whether there is a statistically
|
||||
significant difference in the diagnosis age of males and females.
|
||||
The null hypothesis is that the distribution of male diagnosis ages is
|
||||
the same as the distribution of female diagnosis ages. We decide
|
||||
that a confidence level of 95% is required to reject the null hypothesis
|
||||
in favor of the alternative that the distributions are different.
|
||||
Since the number of samples is very small and there are no ties in the
|
||||
data, we can compare the observed test statistic against the *exact*
|
||||
distribution of the test statistic under the null hypothesis.
|
||||
|
||||
>>> from scipy.stats import mannwhitneyu
|
||||
>>> U1, p = mannwhitneyu(males, females, method="exact")
|
||||
>>> print(U1)
|
||||
17.0
|
||||
|
||||
`mannwhitneyu` always reports the statistic associated with the first
|
||||
sample, which, in this case, is males. This agrees with :math:`U_M = 17`
|
||||
reported in [4]_. The statistic associated with the second statistic
|
||||
can be calculated:
|
||||
|
||||
>>> nx, ny = len(males), len(females)
|
||||
>>> U2 = nx*ny - U1
|
||||
>>> print(U2)
|
||||
3.0
|
||||
|
||||
This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
|
||||
*p*-value can be calculated from either statistic, and the value produced
|
||||
by `mannwhitneyu` agrees with :math:`p = 0.11` reported in [4]_.
|
||||
|
||||
>>> print(p)
|
||||
0.1111111111111111
|
||||
|
||||
The exact distribution of the test statistic is asymptotically normal, so
|
||||
the example continues by comparing the exact *p*-value against the
|
||||
*p*-value produced using the normal approximation.
|
||||
|
||||
>>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
|
||||
>>> print(pnorm)
|
||||
0.11134688653314041
|
||||
|
||||
Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
|
||||
value :math:`p = 0.09` given in [4]_. The reason is that [4]_
|
||||
does not apply the continuity correction performed by `mannwhitneyu`;
|
||||
`mannwhitneyu` reduces the distance between the test statistic and the
|
||||
mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
|
||||
discrete statistic is being compared against a continuous distribution.
|
||||
Here, the :math:`U` statistic used is less than the mean, so we reduce
|
||||
the distance by adding 0.5 in the numerator.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> from scipy.stats import norm
|
||||
>>> U = min(U1, U2)
|
||||
>>> N = nx + ny
|
||||
>>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
|
||||
>>> p = 2 * norm.cdf(z) # use CDF to get p-value from smaller statistic
|
||||
>>> print(p)
|
||||
0.11134688653314041
|
||||
|
||||
If desired, we can disable the continuity correction to get a result
|
||||
that agrees with that reported in [4]_.
|
||||
|
||||
>>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
|
||||
... method="asymptotic")
|
||||
>>> print(pnorm)
|
||||
0.0864107329737
|
||||
|
||||
Regardless of whether we perform an exact or asymptotic test, the
|
||||
probability of the test statistic being as extreme or more extreme by
|
||||
chance exceeds 5%, so we do not consider the results statistically
|
||||
significant.
|
||||
|
||||
Suppose that, before seeing the data, we had hypothesized that females
|
||||
would tend to be diagnosed at a younger age than males.
|
||||
In that case, it would be natural to provide the female ages as the
|
||||
first input, and we would have performed a one-sided test using
|
||||
``alternative = 'less'``: females are diagnosed at an age that is
|
||||
stochastically less than that of males.
|
||||
|
||||
>>> res = mannwhitneyu(females, males, alternative="less", method="exact")
|
||||
>>> print(res)
|
||||
MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)
|
||||
|
||||
Again, the probability of getting a sufficiently low value of the
|
||||
test statistic by chance under the null hypothesis is greater than 5%,
|
||||
so we do not reject the null hypothesis in favor of our alternative.
|
||||
|
||||
If it is reasonable to assume that the means of samples from the
|
||||
populations are normally distributed, we could have used a t-test to
|
||||
perform the analysis.
|
||||
|
||||
>>> from scipy.stats import ttest_ind
|
||||
>>> res = ttest_ind(females, males, alternative="less")
|
||||
>>> print(res)
|
||||
TtestResult(statistic=-2.239334696520584,
|
||||
pvalue=0.030068441095757924,
|
||||
df=7.0)
|
||||
|
||||
Under this assumption, the *p*-value would be low enough to reject the
|
||||
null hypothesis in favor of the alternative.
|
||||
|
||||
'''
|
||||
|
||||
x, y, use_continuity, alternative, axis_int, method = (
|
||||
_mwu_input_validation(x, y, use_continuity, alternative, axis, method))
|
||||
|
||||
x, y, xy = _broadcast_concatenate(x, y, axis)
|
||||
|
||||
n1, n2 = x.shape[-1], y.shape[-1]
|
||||
|
||||
# Follows [2]
|
||||
ranks, t = _rankdata(xy, 'average', return_ties=True) # method 2, step 1
|
||||
R1 = ranks[..., :n1].sum(axis=-1) # method 2, step 2
|
||||
U1 = R1 - n1*(n1+1)/2 # method 2, step 3
|
||||
U2 = n1 * n2 - U1 # as U1 + U2 = n1 * n2
|
||||
|
||||
if alternative == "greater":
|
||||
U, f = U1, 1 # U is the statistic to use for p-value, f is a factor
|
||||
elif alternative == "less":
|
||||
U, f = U2, 1 # Due to symmetry, use SF of U2 rather than CDF of U1
|
||||
else:
|
||||
U, f = np.maximum(U1, U2), 2 # multiply SF by two for two-sided test
|
||||
|
||||
if method == "auto":
|
||||
method = _mwu_choose_method(n1, n2, np.any(t > 1))
|
||||
|
||||
if method == "exact":
|
||||
_mwu_state.set_shapes(n1, n2)
|
||||
p = _mwu_state.sf(U.astype(int))
|
||||
elif method == "asymptotic":
|
||||
z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity)
|
||||
p = stats.norm.sf(z)
|
||||
else: # `PermutationMethod` instance (already validated)
|
||||
def statistic(x, y, axis):
|
||||
return mannwhitneyu(x, y, use_continuity=use_continuity,
|
||||
alternative=alternative, axis=axis,
|
||||
method="asymptotic").statistic
|
||||
|
||||
res = stats.permutation_test((x, y), statistic, axis=axis,
|
||||
**method._asdict(), alternative=alternative)
|
||||
p = res.pvalue
|
||||
f = 1
|
||||
|
||||
p *= f
|
||||
|
||||
# Ensure that test statistic is not greater than 1
|
||||
# This could happen for exact test when U = m*n/2
|
||||
p = np.clip(p, 0, 1)
|
||||
|
||||
return MannwhitneyuResult(U1, p)
|
||||
550
venv/lib/python3.12/site-packages/scipy/stats/_mgc.py
Normal file
550
venv/lib/python3.12/site-packages/scipy/stats/_mgc.py
Normal file
@ -0,0 +1,550 @@
|
||||
import warnings
|
||||
import numpy as np
|
||||
|
||||
from scipy._lib._util import check_random_state, MapWrapper, rng_integers, _contains_nan
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
from scipy.spatial.distance import cdist
|
||||
from scipy.ndimage import _measurements
|
||||
|
||||
from ._stats import _local_correlations # type: ignore[import-not-found]
|
||||
from . import distributions
|
||||
|
||||
__all__ = ['multiscale_graphcorr']
|
||||
|
||||
# FROM MGCPY: https://github.com/neurodata/mgcpy
|
||||
|
||||
|
||||
class _ParallelP:
|
||||
"""Helper function to calculate parallel p-value."""
|
||||
|
||||
def __init__(self, x, y, random_states):
|
||||
self.x = x
|
||||
self.y = y
|
||||
self.random_states = random_states
|
||||
|
||||
def __call__(self, index):
|
||||
order = self.random_states[index].permutation(self.y.shape[0])
|
||||
permy = self.y[order][:, order]
|
||||
|
||||
# calculate permuted stats, store in null distribution
|
||||
perm_stat = _mgc_stat(self.x, permy)[0]
|
||||
|
||||
return perm_stat
|
||||
|
||||
|
||||
def _perm_test(x, y, stat, reps=1000, workers=-1, random_state=None):
|
||||
r"""Helper function that calculates the p-value. See below for uses.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x, y : ndarray
|
||||
`x` and `y` have shapes `(n, p)` and `(n, q)`.
|
||||
stat : float
|
||||
The sample test statistic.
|
||||
reps : int, optional
|
||||
The number of replications used to estimate the null when using the
|
||||
permutation test. The default is 1000 replications.
|
||||
workers : int or map-like callable, optional
|
||||
If `workers` is an int the population is subdivided into `workers`
|
||||
sections and evaluated in parallel (uses
|
||||
`multiprocessing.Pool <multiprocessing>`). Supply `-1` to use all cores
|
||||
available to the Process. Alternatively supply a map-like callable,
|
||||
such as `multiprocessing.Pool.map` for evaluating the population in
|
||||
parallel. This evaluation is carried out as `workers(func, iterable)`.
|
||||
Requires that `func` be pickleable.
|
||||
random_state : {None, int, `numpy.random.Generator`,
|
||||
`numpy.random.RandomState`}, optional
|
||||
|
||||
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
|
||||
singleton is used.
|
||||
If `seed` is an int, a new ``RandomState`` instance is used,
|
||||
seeded with `seed`.
|
||||
If `seed` is already a ``Generator`` or ``RandomState`` instance then
|
||||
that instance is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
pvalue : float
|
||||
The sample test p-value.
|
||||
null_dist : list
|
||||
The approximated null distribution.
|
||||
|
||||
"""
|
||||
# generate seeds for each rep (change to new parallel random number
|
||||
# capabilities in numpy >= 1.17+)
|
||||
random_state = check_random_state(random_state)
|
||||
random_states = [np.random.RandomState(rng_integers(random_state, 1 << 32,
|
||||
size=4, dtype=np.uint32)) for _ in range(reps)]
|
||||
|
||||
# parallelizes with specified workers over number of reps and set seeds
|
||||
parallelp = _ParallelP(x=x, y=y, random_states=random_states)
|
||||
with MapWrapper(workers) as mapwrapper:
|
||||
null_dist = np.array(list(mapwrapper(parallelp, range(reps))))
|
||||
|
||||
# calculate p-value and significant permutation map through list
|
||||
pvalue = (1 + (null_dist >= stat).sum()) / (1 + reps)
|
||||
|
||||
return pvalue, null_dist
|
||||
|
||||
|
||||
def _euclidean_dist(x):
|
||||
return cdist(x, x)
|
||||
|
||||
|
||||
MGCResult = _make_tuple_bunch('MGCResult',
|
||||
['statistic', 'pvalue', 'mgc_dict'], [])
|
||||
|
||||
|
||||
def multiscale_graphcorr(x, y, compute_distance=_euclidean_dist, reps=1000,
                         workers=1, is_twosamp=False, random_state=None):
    r"""Computes the Multiscale Graph Correlation (MGC) test statistic.

    Specifically, for each point, MGC finds the :math:`k`-nearest neighbors for
    one property (e.g. cloud density), and the :math:`l`-nearest neighbors for
    the other property (e.g. grass wetness) [1]_. This pair :math:`(k, l)` is
    called the "scale". A priori, however, it is not known which scales will be
    most informative. So, MGC computes all distance pairs, and then efficiently
    computes the distance correlations for all scales. The local correlations
    illustrate which scales are relatively informative about the relationship.
    The key, therefore, to successfully discover and decipher relationships
    between disparate data modalities is to adaptively determine which scales
    are the most informative, and the geometric implication for the most
    informative scales. Doing so not only provides an estimate of whether the
    modalities are related, but also provides insight into how the
    determination was made. This is especially important in high-dimensional
    data, where simple visualizations do not reveal relationships to the
    unaided human eye. Characterizations of this implementation in particular
    have been derived from and benchmarked within [2]_.

    Parameters
    ----------
    x, y : ndarray
        If ``x`` and ``y`` have shapes ``(n, p)`` and ``(n, q)`` where `n` is
        the number of samples and `p` and `q` are the number of dimensions,
        then the MGC independence test will be run. Alternatively, ``x`` and
        ``y`` can have shapes ``(n, n)`` if they are distance or similarity
        matrices, and ``compute_distance`` must be set to ``None``. If ``x``
        and ``y`` have shapes ``(n, p)`` and ``(m, p)``, an unpaired
        two-sample MGC test will be run.
    compute_distance : callable, optional
        A function that computes the distance or similarity among the samples
        within each data matrix. Set to ``None`` if ``x`` and ``y`` are
        already distance matrices. The default uses the euclidean norm metric.
        If you are calling a custom function, either create the distance
        matrix before-hand or create a function of the form
        ``compute_distance(x)`` where `x` is the data matrix for which
        pairwise distances are calculated.
    reps : int, optional
        The number of replications used to estimate the null when using the
        permutation test. The default is ``1000``.
    workers : int or map-like callable, optional
        If ``workers`` is an int the population is subdivided into ``workers``
        sections and evaluated in parallel (uses ``multiprocessing.Pool
        <multiprocessing>``). Supply ``-1`` to use all cores available to the
        Process. Alternatively supply a map-like callable, such as
        ``multiprocessing.Pool.map`` for evaluating the p-value in parallel.
        This evaluation is carried out as ``workers(func, iterable)``.
        Requires that `func` be pickleable. The default is ``1``.
    is_twosamp : bool, optional
        If `True`, a two sample test will be run. If ``x`` and ``y`` have
        shapes ``(n, p)`` and ``(m, p)``, this option will be overridden and
        set to ``True``. Set to ``True`` if ``x`` and ``y`` both have shapes
        ``(n, p)`` and a two sample test is desired. The default is ``False``.
        Note that this will not run if inputs are distance matrices.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        If `seed` is None (or `np.random`), the `numpy.random.RandomState`
        singleton is used.
        If `seed` is an int, a new ``RandomState`` instance is used,
        seeded with `seed`.
        If `seed` is already a ``Generator`` or ``RandomState`` instance then
        that instance is used.

    Returns
    -------
    res : MGCResult
        An object containing attributes:

        statistic : float
            The sample MGC test statistic within `[-1, 1]`.
        pvalue : float
            The p-value obtained via permutation.
        mgc_dict : dict
            Contains additional useful results:

                - mgc_map : ndarray
                    A 2D representation of the latent geometry of the
                    relationship.
                - opt_scale : (int, int)
                    The estimated optimal scale as a `(x, y)` pair.
                - null_dist : list
                    The null distribution derived from the permuted matrices.

    See Also
    --------
    pearsonr : Pearson correlation coefficient and p-value for testing
               non-correlation.
    kendalltau : Calculates Kendall's tau.
    spearmanr : Calculates a Spearman rank-order correlation coefficient.

    Notes
    -----
    A description of the process of MGC and applications on neuroscience data
    can be found in [1]_. It is performed using the following steps:

    #. Two distance matrices :math:`D^X` and :math:`D^Y` are computed and
       modified to be mean zero columnwise. This results in two
       :math:`n \times n` distance matrices :math:`A` and :math:`B` (the
       centering and unbiased modification) [3]_.

    #. For all values :math:`k` and :math:`l` from :math:`1, ..., n`,

       * The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
         are calculated for each property. Here, :math:`G_k (i, j)` indicates
         the :math:`k`-smallest values of the :math:`i`-th row of :math:`A`
         and :math:`H_l (i, j)` indicates the :math:`l` smallest values of
         the :math:`i`-th row of :math:`B`

       * Let :math:`\circ` denote the entry-wise matrix product, then local
         correlations are summed and normalized using the following statistic:

       .. math::

           c^{kl} = \frac{\sum_{ij} A G_k B H_l}
                         {\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}

    #. The MGC test statistic is the smoothed optimal local correlation of
       :math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
       (which essentially sets all isolated large correlations to 0 and leaves
       connected large correlations the same as before, see [3]_). MGC is,

       .. math::

           MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
                                                \right)

    The test statistic returns a value between :math:`(-1, 1)` since it is
    normalized.

    The p-value returned is calculated using a permutation test. This process
    is completed by first randomly permuting :math:`y` to estimate the null
    distribution and then calculating the probability of observing a test
    statistic, under the null, at least as extreme as the observed test
    statistic.

    MGC requires at least 5 samples to run with reliable results. It can also
    handle high-dimensional data sets.
    In addition, by manipulating the input data matrices, the two-sample
    testing problem can be reduced to the independence testing problem [4]_.
    Given sample data :math:`U` and :math:`V` of sizes :math:`p \times n`
    :math:`p \times m`, data matrix :math:`X` and :math:`Y` can be created as
    follows:

    .. math::

        X = [U | V] \in \mathcal{R}^{p \times (n + m)}
        Y = [0_{1 \times n} | 1_{1 \times m}] \in \mathcal{R}^{(n + m)}

    Then, the MGC statistic can be calculated as normal. This methodology can
    be extended to similar tests such as distance correlation [4]_.

    .. versionadded:: 1.4.0

    References
    ----------
    .. [1] Vogelstein, J. T., Bridgeford, E. W., Wang, Q., Priebe, C. E.,
           Maggioni, M., & Shen, C. (2019). Discovering and deciphering
           relationships across disparate data modalities. ELife.
    .. [2] Panda, S., Palaniappan, S., Xiong, J., Swaminathan, A.,
           Ramachandran, S., Bridgeford, E. W., ... Vogelstein, J. T. (2019).
           mgcpy: A Comprehensive High Dimensional Independence Testing Python
           Package. :arXiv:`1907.02088`
    .. [3] Shen, C., Priebe, C.E., & Vogelstein, J. T. (2019). From distance
           correlation to multiscale graph correlation. Journal of the American
           Statistical Association.
    .. [4] Shen, C. & Vogelstein, J. T. (2018). The Exact Equivalence of
           Distance and Kernel Methods for Hypothesis Testing.
           :arXiv:`1806.05514`

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import multiscale_graphcorr
    >>> x = np.arange(100)
    >>> y = x
    >>> res = multiscale_graphcorr(x, y)
    >>> res.statistic, res.pvalue
    (1.0, 0.001)

    To run an unpaired two-sample test,

    >>> x = np.arange(100)
    >>> y = np.arange(79)
    >>> res = multiscale_graphcorr(x, y)
    >>> res.statistic, res.pvalue  # doctest: +SKIP
    (0.033258146255703246, 0.023)

    or, if shape of the inputs are the same,

    >>> x = np.arange(100)
    >>> y = x
    >>> res = multiscale_graphcorr(x, y, is_twosamp=True)
    >>> res.statistic, res.pvalue  # doctest: +SKIP
    (-0.008021809890200488, 1.0)

    """
    if not isinstance(x, np.ndarray) or not isinstance(y, np.ndarray):
        raise ValueError("x and y must be ndarrays")

    # convert arrays of type (n,) to (n, 1)
    if x.ndim == 1:
        x = x[:, np.newaxis]
    elif x.ndim != 2:
        raise ValueError(f"Expected a 2-D array `x`, found shape {x.shape}")
    if y.ndim == 1:
        y = y[:, np.newaxis]
    elif y.ndim != 2:
        raise ValueError(f"Expected a 2-D array `y`, found shape {y.shape}")

    nx, px = x.shape
    ny, py = y.shape

    # check for NaNs (raises rather than propagating them into the statistic)
    _contains_nan(x, nan_policy='raise')
    _contains_nan(y, nan_policy='raise')

    # check for positive or negative infinity and raise error
    if np.sum(np.isinf(x)) > 0 or np.sum(np.isinf(y)) > 0:
        raise ValueError("Inputs contain infinities")

    if nx != ny:
        if px == py:
            # unequal sample sizes with equal dimensions: silently switch to
            # the unpaired two-sample test (documented override of is_twosamp)
            is_twosamp = True
        else:
            raise ValueError("Shape mismatch, x and y must have shape [n, p] "
                             "and [n, q] or have shape [n, p] and [m, p].")

    if nx < 5 or ny < 5:
        raise ValueError("MGC requires at least 5 samples to give reasonable "
                         "results.")

    # convert x and y to float
    x = x.astype(np.float64)
    y = y.astype(np.float64)

    # `compute_distance` must be callable, or None for precomputed matrices
    if not callable(compute_distance) and compute_distance is not None:
        raise ValueError("Compute_distance must be a function.")

    # validate reps: must be a non-negative int; warn when under 1000 since
    # the permutation p-value resolution is 1 / (reps + 1).
    # NOTE(review): the check admits reps == 0 although the message says
    # "greater than 0" -- confirm intended behavior before tightening.
    if not isinstance(reps, int) or reps < 0:
        raise ValueError("Number of reps must be an integer greater than 0.")
    elif reps < 1000:
        msg = ("The number of replications is low (under 1000), and p-value "
               "calculations may be unreliable. Use the p-value result, with "
               "caution!")
        warnings.warn(msg, RuntimeWarning, stacklevel=2)

    if is_twosamp:
        if compute_distance is None:
            raise ValueError("Cannot run if inputs are distance matrices")
        # reduce the two-sample problem to an independence test by stacking
        # the samples and labeling each row with its source sample
        x, y = _two_sample_transform(x, y)

    if compute_distance is not None:
        # compute distance matrices for x and y
        x = compute_distance(x)
        y = compute_distance(y)

    # calculate MGC stat
    stat, stat_dict = _mgc_stat(x, y)
    stat_mgc_map = stat_dict["stat_mgc_map"]
    opt_scale = stat_dict["opt_scale"]

    # calculate permutation MGC p-value
    pvalue, null_dist = _perm_test(x, y, stat, reps=reps, workers=workers,
                                   random_state=random_state)

    # save all stats (other than stat/p-value) in dictionary
    mgc_dict = {"mgc_map": stat_mgc_map,
                "opt_scale": opt_scale,
                "null_dist": null_dist}

    # create result object with alias for backward compatibility
    res = MGCResult(stat, pvalue, mgc_dict)
    res.stat = stat
    return res
|
||||
|
||||
|
||||
def _mgc_stat(distx, disty):
    r"""Helper function that calculates the MGC stat. See above for use.

    Parameters
    ----------
    distx, disty : ndarray
        `distx` and `disty` have shapes `(n, p)` and `(n, q)` or
        `(n, n)` and `(n, n)`
        if distance matrices.

    Returns
    -------
    stat : float
        The sample MGC test statistic within `[-1, 1]`.
    stat_dict : dict
        Dictionary of additional useful results, with the following keys:

        - stat_mgc_map : ndarray
            MGC-map of the statistics.
        - opt_scale : (float, float)
            The estimated optimal scale as a `(x, y)` pair.  In the
            degenerate one-row/one-column case below this is the single
            int ``m * n`` rather than a pair.

    """
    # calculate MGC map and optimal scale
    stat_mgc_map = _local_correlations(distx, disty, global_corr='mgc')

    n, m = stat_mgc_map.shape
    if m == 1 or n == 1:
        # the global scale is the statistic calculated at the maximal number
        # of nearest neighbors; there is not enough local scale to search
        # over, so default to the global scale.
        # NOTE(review): with ``n, m = shape`` the subscripts below look
        # swapped ([m - 1][n - 1] instead of [n - 1][m - 1]); for a 1x1 map
        # both forms agree, but a non-square degenerate map would misindex.
        # TODO confirm against `_local_correlations` output shapes.
        stat = stat_mgc_map[m - 1][n - 1]
        opt_scale = m * n
    else:
        samp_size = len(distx) - 1

        # threshold to find connected region of significant local correlations
        sig_connect = _threshold_mgc_map(stat_mgc_map, samp_size)

        # maximum within the significant region
        stat, opt_scale = _smooth_mgc_map(sig_connect, stat_mgc_map)

    stat_dict = {"stat_mgc_map": stat_mgc_map,
                 "opt_scale": opt_scale}

    return stat, stat_dict
|
||||
|
||||
|
||||
def _threshold_mgc_map(stat_mgc_map, samp_size):
    r"""
    Finds a connected region of significance in the MGC-map by thresholding.

    The threshold is the larger of a beta-approximation significance cutoff
    and the global-scale statistic; the largest connected component of
    above-threshold local correlations is returned.

    Parameters
    ----------
    stat_mgc_map : ndarray
        All local correlations within `[-1,1]`.
    samp_size : int
        The sample size of original data.

    Returns
    -------
    sig_connect : ndarray
        A binary matrix with 1's indicating the significant region.
        Note: when no entry exceeds the threshold this is the fixed
        1x1 array ``[[False]]`` regardless of the input's shape.

    """
    m, n = stat_mgc_map.shape

    # 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
    # with varying levels of performance. Threshold is based on a beta
    # approximation.
    per_sig = 1 - (0.02 / samp_size)  # Percentile to consider as significant
    threshold = samp_size * (samp_size - 3)/4 - 1/2  # Beta approximation
    threshold = distributions.beta.ppf(per_sig, threshold, threshold) * 2 - 1

    # the global scale is the statistic calculated at the maximal number of
    # nearest neighbors. Threshold is the maximum on the global and local
    # scales.
    threshold = max(threshold, stat_mgc_map[m - 1][n - 1])

    # find the largest connected component of significant correlations
    sig_connect = stat_mgc_map > threshold
    if np.sum(sig_connect) > 0:
        # label connected components, then keep only the most populous one
        sig_connect, _ = _measurements.label(sig_connect)
        _, label_counts = np.unique(sig_connect, return_counts=True)

        # skip the first element in label_counts, as it is count(zeros)
        max_label = np.argmax(label_counts[1:]) + 1
        sig_connect = sig_connect == max_label
    else:
        sig_connect = np.array([[False]])

    return sig_connect
|
||||
|
||||
|
||||
def _smooth_mgc_map(sig_connect, stat_mgc_map):
|
||||
"""Finds the smoothed maximal within the significant region R.
|
||||
|
||||
If area of R is too small it returns the last local correlation. Otherwise,
|
||||
returns the maximum within significant_connected_region.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sig_connect : ndarray
|
||||
A binary matrix with 1's indicating the significant region.
|
||||
stat_mgc_map : ndarray
|
||||
All local correlations within `[-1, 1]`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
stat : float
|
||||
The sample MGC statistic within `[-1, 1]`.
|
||||
opt_scale: (float, float)
|
||||
The estimated optimal scale as an `(x, y)` pair.
|
||||
|
||||
"""
|
||||
m, n = stat_mgc_map.shape
|
||||
|
||||
# the global scale at is the statistic calculated at maximial nearest
|
||||
# neighbors. By default, statistic and optimal scale are global.
|
||||
stat = stat_mgc_map[m - 1][n - 1]
|
||||
opt_scale = [m, n]
|
||||
|
||||
if np.linalg.norm(sig_connect) != 0:
|
||||
# proceed only when the connected region's area is sufficiently large
|
||||
# 0.02 is simply an empirical threshold, this can be set to 0.01 or 0.05
|
||||
# with varying levels of performance
|
||||
if np.sum(sig_connect) >= np.ceil(0.02 * max(m, n)) * min(m, n):
|
||||
max_corr = max(stat_mgc_map[sig_connect])
|
||||
|
||||
# find all scales within significant_connected_region that maximize
|
||||
# the local correlation
|
||||
max_corr_index = np.where((stat_mgc_map >= max_corr) & sig_connect)
|
||||
|
||||
if max_corr >= stat:
|
||||
stat = max_corr
|
||||
|
||||
k, l = max_corr_index
|
||||
one_d_indices = k * n + l # 2D to 1D indexing
|
||||
k = np.max(one_d_indices) // n
|
||||
l = np.max(one_d_indices) % n
|
||||
opt_scale = [k+1, l+1] # adding 1s to match R indexing
|
||||
|
||||
return stat, opt_scale
|
||||
|
||||
|
||||
def _two_sample_transform(u, v):
|
||||
"""Helper function that concatenates x and y for two sample MGC stat.
|
||||
|
||||
See above for use.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
u, v : ndarray
|
||||
`u` and `v` have shapes `(n, p)` and `(m, p)`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
x : ndarray
|
||||
Concatenate `u` and `v` along the `axis = 0`. `x` thus has shape
|
||||
`(2n, p)`.
|
||||
y : ndarray
|
||||
Label matrix for `x` where 0 refers to samples that comes from `u` and
|
||||
1 refers to samples that come from `v`. `y` thus has shape `(2n, 1)`.
|
||||
|
||||
"""
|
||||
nx = u.shape[0]
|
||||
ny = v.shape[0]
|
||||
x = np.concatenate([u, v], axis=0)
|
||||
y = np.concatenate([np.zeros(nx), np.ones(ny)], axis=0).reshape(-1, 1)
|
||||
return x, y
|
||||
4997
venv/lib/python3.12/site-packages/scipy/stats/_morestats.py
Normal file
4997
venv/lib/python3.12/site-packages/scipy/stats/_morestats.py
Normal file
File diff suppressed because it is too large
Load Diff
3663
venv/lib/python3.12/site-packages/scipy/stats/_mstats_basic.py
Normal file
3663
venv/lib/python3.12/site-packages/scipy/stats/_mstats_basic.py
Normal file
File diff suppressed because it is too large
Load Diff
521
venv/lib/python3.12/site-packages/scipy/stats/_mstats_extras.py
Normal file
521
venv/lib/python3.12/site-packages/scipy/stats/_mstats_extras.py
Normal file
@ -0,0 +1,521 @@
|
||||
"""
|
||||
Additional statistics functions with support for masked arrays.
|
||||
|
||||
"""
|
||||
|
||||
# Original author (2007): Pierre GF Gerard-Marchant
|
||||
|
||||
|
||||
__all__ = ['compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',]
|
||||
|
||||
|
||||
import numpy as np
|
||||
from numpy import float64, ndarray
|
||||
|
||||
import numpy.ma as ma
|
||||
from numpy.ma import MaskedArray
|
||||
|
||||
from . import _mstats_basic as mstats
|
||||
|
||||
from scipy.stats.distributions import norm, beta, t, binom
|
||||
|
||||
|
||||
def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
    """
    Computes quantile estimates with the Harrell-Davis method.

    The quantile estimates are calculated as a weighted linear combination
    of order statistics.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of probabilities at which to compute the quantiles.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdquantiles : MaskedArray
        A (p,) array of quantiles (if `var` is False), or a (2,p) array of
        quantiles and variances (if `var` is True), where ``p`` is the
        number of quantiles.

    See Also
    --------
    hdquantiles_sd

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.mstats import hdquantiles
    >>>
    >>> # Sample data
    >>> data = np.array([1.2, 2.5, 3.7, 4.0, 5.1, 6.3, 7.0, 8.2, 9.4])
    >>>
    >>> # Probabilities at which to compute quantiles
    >>> probabilities = [0.25, 0.5, 0.75]
    >>>
    >>> # Compute Harrell-Davis quantile estimates
    >>> quantile_estimates = hdquantiles(data, prob=probabilities)
    >>>
    >>> # Display the quantile estimates
    >>> for i, quantile in enumerate(probabilities):
    ...     print(f"{int(quantile * 100)}th percentile: {quantile_estimates[i]}")
    25th percentile: 3.1505820231763066 # may vary
    50th percentile: 5.194344084883956
    75th percentile: 7.430626414674935

    """
    def _hd_1D(data,prob,var):
        "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
        xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
        # Don't use length here, in case we have a numpy scalar
        n = xsorted.size

        # row 0 holds the quantile estimates, row 1 their variances
        hd = np.empty((2,len(prob)), float64)
        if n < 2:
            # not enough data: return NaNs (masked by fix_invalid below)
            hd.flat = np.nan
            if var:
                return hd
            return hd[0]

        v = np.arange(n+1) / float(n)
        betacdf = beta.cdf
        for (i,p) in enumerate(prob):
            # Harrell-Davis weights: Beta((n+1)p, (n+1)(1-p)) mass on each
            # order-statistic cell
            _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
            w = _w[1:] - _w[:-1]
            hd_mean = np.dot(w, xsorted)
            hd[0,i] = hd_mean
            # second central moment of the weighted combination
            hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)

        # extreme probabilities degenerate to the sample extremes
        hd[0, prob == 0] = xsorted[0]
        hd[0, prob == 1] = xsorted[-1]
        if var:
            # no meaningful variance at the extremes
            hd[1, prob == 0] = hd[1, prob == 1] = np.nan
            return hd
        return hd[0]
    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float64)
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None) or (data.ndim == 1):
        result = _hd_1D(data, p, var)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hd_1D, axis, data, p, var)

    return ma.fix_invalid(result, copy=False)
|
||||
|
||||
|
||||
def hdmedian(data, axis=-1, var=False):
    """
    Returns the Harrell-Davis estimate of the median along the given axis.

    Parameters
    ----------
    data : ndarray
        Data array.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.
    var : bool, optional
        Whether to return the variance of the estimate.

    Returns
    -------
    hdmedian : MaskedArray
        The median values. If ``var=True``, the variance is returned inside
        the masked array. E.g. for a 1-D array the shape change from (1,) to
        (2,).

    """
    # The HD median is the HD estimate of the 0.5 quantile; squeeze away the
    # singleton probability axis.
    estimate = hdquantiles(data, [0.5], axis=axis, var=var)
    return estimate.squeeze()
|
||||
|
||||
|
||||
def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
    """
    The standard error of the Harrell-Davis quantile estimates by jackknife.

    Parameters
    ----------
    data : array_like
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    hdquantiles_sd : MaskedArray
        Standard error of the Harrell-Davis quantile estimates.  Entries
        for samples with fewer than two unmasked values are masked.

    See Also
    --------
    hdquantiles

    """
    def _hdsd_1D(data, prob):
        "Computes the std error for 1D arrays."
        xsorted = np.sort(data.compressed())
        n = len(xsorted)

        hdsd = np.empty(len(prob), float64)
        if n < 2:
            # Not enough data for a jackknife estimate: return NaNs, which
            # `ma.fix_invalid` masks below.  Previously this fell through to
            # the loop, which emitted a divide-by-zero warning and overwrote
            # the NaN fill with a misleading 0.0 for n == 1.
            hdsd.flat = np.nan
            return hdsd

        vv = np.arange(n) / float(n-1)
        betacdf = beta.cdf

        for (i,p) in enumerate(prob):
            # Harrell-Davis weights over the n-1 jackknife cells
            _w = betacdf(vv, n*p, n*(1-p))
            w = _w[1:] - _w[:-1]
            # cumulative sum of weights and data points if
            # ith point is left out for jackknife
            mx_ = np.zeros_like(xsorted)
            mx_[1:] = np.cumsum(w * xsorted[:-1])
            # similar but from the right
            mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
            # jackknife standard error of the HD quantile estimate
            hdsd[i] = np.sqrt(mx_.var() * (n - 1))
        return hdsd

    # Initialization & checks
    data = ma.array(data, copy=False, dtype=float64)
    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _hdsd_1D(data, p)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_hdsd_1D, axis, data, p)

    return ma.fix_invalid(result, copy=False).ravel()
|
||||
|
||||
|
||||
def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
                    alpha=0.05, axis=None):
    """
    Selected confidence interval of the trimmed mean along the given axis.

    Parameters
    ----------
    data : array_like
        Input data.
    limits : {None, tuple}, optional
        None or a two item tuple.
        Tuple of the percentages to cut on each side of the array, with respect
        to the number of unmasked data, as floats between 0. and 1. If ``n``
        is the number of unmasked data before trimming, then
        (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
        largest data are masked. The total number of unmasked data after
        trimming is ``n * (1. - sum(limits))``.
        The value of one limit can be set to None to indicate an open interval.

        Defaults to (0.2, 0.2).
    inclusive : (2,) tuple of boolean, optional
        If relative==False, tuple indicating whether values exactly equal to
        the absolute limits are allowed.
        If relative==True, tuple indicating whether the number of data being
        masked on each side should be rounded (True) or truncated (False).

        Defaults to (True, True).
    alpha : float, optional
        Confidence level of the intervals.

        Defaults to 0.05.
    axis : int, optional
        Axis along which to cut. If None, uses a flattened version of `data`.

        Defaults to None.

    Returns
    -------
    trimmed_mean_ci : (2,) ndarray
        The lower and upper confidence intervals of the trimmed data.

    """
    arr = ma.array(data, copy=False)
    trimmed = mstats.trimr(arr, limits=limits, inclusive=inclusive, axis=axis)
    # Student-t interval around the trimmed mean, with the standard error of
    # the trimmed mean and degrees of freedom from the trimmed sample size.
    tmean = trimmed.mean(axis)
    tstde = mstats.trimmed_stde(arr, limits=limits, inclusive=inclusive,
                                axis=axis)
    dof = trimmed.count(axis) - 1
    margin = t.ppf(1 - alpha / 2., dof) * tstde
    return np.array((tmean - margin, tmean + margin))
|
||||
|
||||
|
||||
def mjci(data, prob=[0.25,0.5,0.75], axis=None):
    """
    Returns the Maritz-Jarrett estimators of the standard error of selected
    experimental quantiles of the data.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    """
    def _mjci_1D(data, p):
        # Work on the sorted, unmasked values only.
        values = np.sort(data.compressed())
        n = values.size
        # Ranks of the selected experimental quantiles.
        ranks = (np.array(p) * n + 0.5).astype(int)
        betacdf = beta.cdf

        se = np.empty(len(ranks), float64)
        upper = np.arange(1, n + 1, dtype=float64) / n
        lower = upper - 1. / n
        for (i, m) in enumerate(ranks):
            # Maritz-Jarrett weights: mass that the Beta(m-1, n-m) law
            # assigns to each order-statistic cell.
            W = betacdf(upper, m - 1, n - m) - betacdf(lower, m - 1, n - m)
            first_moment = np.dot(W, values)
            second_moment = np.dot(W, values ** 2)
            se[i] = np.sqrt(second_moment - first_moment ** 2)
        return se

    data = ma.array(data, copy=False)
    if data.ndim > 2:
        raise ValueError("Array 'data' must be at most two dimensional, "
                         "but got data.ndim = %d" % data.ndim)

    p = np.atleast_1d(np.asarray(prob))
    # Computes quantiles along axis (or globally)
    if axis is None:
        return _mjci_1D(data, p)
    else:
        return ma.apply_along_axis(_mjci_1D, axis, data, p)
|
||||
|
||||
|
||||
def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
    """
    Computes the alpha confidence interval for the selected quantiles of the
    data, with Maritz-Jarrett estimators.

    Parameters
    ----------
    data : ndarray
        Data array.
    prob : sequence, optional
        Sequence of quantiles to compute.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles.
        If None, use a flattened array.

    Returns
    -------
    ci_lower : ndarray
        The lower boundaries of the confidence interval. Of the same length as
        `prob`.
    ci_upper : ndarray
        The upper boundaries of the confidence interval. Of the same length as
        `prob`.

    """
    # Work with the smaller tail so alpha > 0.5 behaves like 1 - alpha.
    alpha = min(alpha, 1 - alpha)
    z = norm.ppf(1 - alpha / 2.)
    # Point estimates of the quantiles and their Maritz-Jarrett standard
    # errors give a symmetric normal-approximation interval.
    quantiles = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
    stderr = mjci(data, prob, axis=axis)
    halfwidth = z * stderr
    return (quantiles - halfwidth, quantiles + halfwidth)
|
||||
|
||||
|
||||
def median_cihs(data, alpha=0.05, axis=None):
    """
    Computes the alpha-level confidence interval for the median of the data.

    Uses the Hettmansperger-Sheather method.

    Parameters
    ----------
    data : array_like
        Input data. Masked values are discarded. The input should be 1D only,
        or `axis` should be set to None.
    alpha : float, optional
        Confidence level of the intervals.
    axis : int or None, optional
        Axis along which to compute the quantiles. If None, use a flattened
        array.

    Returns
    -------
    median_cihs
        Alpha level confidence interval.

    """
    def _cihs_1D(data, alpha):
        # Interpolated order-statistic interval for the median of one
        # 1-D sample (Hettmansperger-Sheather method).
        data = np.sort(data.compressed())
        n = len(data)
        # work with the smaller tail so alpha > 0.5 behaves like 1 - alpha
        alpha = min(alpha, 1-alpha)
        # k: order-statistic depth from Binomial(n, 1/2).
        # NOTE(review): relies on the private ``binom._ppf``, presumably to
        # bypass argument validation -- confirm before changing.
        k = int(binom._ppf(alpha/2., n, 0.5))
        # coverage of the order-statistic interval (data[k-1], data[n-k])
        gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
        if gk < 1-alpha:
            # widen by one order statistic when coverage is insufficient
            k -= 1
            gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
        # coverage of the next narrower interval; interpolate between the two
        gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
        I = (gk - 1 + alpha)/(gk - gkk)
        lambd = (n-k) * I / float(k + (n-2*k)*I)
        lims = (lambd*data[k] + (1-lambd)*data[k-1],
                lambd*data[n-k-1] + (1-lambd)*data[n-k])
        return lims
    data = ma.array(data, copy=False)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        result = _cihs_1D(data, alpha)
    else:
        if data.ndim > 2:
            raise ValueError("Array 'data' must be at most two dimensional, "
                             "but got data.ndim = %d" % data.ndim)
        result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)

    return result
|
||||
|
||||
|
||||
def compare_medians_ms(group_1, group_2, axis=None):
|
||||
"""
|
||||
Compares the medians from two independent groups along the given axis.
|
||||
|
||||
The comparison is performed using the McKean-Schrader estimate of the
|
||||
standard error of the medians.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
group_1 : array_like
|
||||
First dataset. Has to be of size >=7.
|
||||
group_2 : array_like
|
||||
Second dataset. Has to be of size >=7.
|
||||
axis : int, optional
|
||||
Axis along which the medians are estimated. If None, the arrays are
|
||||
flattened. If `axis` is not None, then `group_1` and `group_2`
|
||||
should have the same shape.
|
||||
|
||||
Returns
|
||||
-------
|
||||
compare_medians_ms : {float, ndarray}
|
||||
If `axis` is None, then returns a float, otherwise returns a 1-D
|
||||
ndarray of floats with a length equal to the length of `group_1`
|
||||
along `axis`.
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from scipy import stats
|
||||
>>> a = [1, 2, 3, 4, 5, 6, 7]
|
||||
>>> b = [8, 9, 10, 11, 12, 13, 14]
|
||||
>>> stats.mstats.compare_medians_ms(a, b, axis=None)
|
||||
1.0693225866553746e-05
|
||||
|
||||
The function is vectorized to compute along a given axis.
|
||||
|
||||
>>> import numpy as np
|
||||
>>> rng = np.random.default_rng()
|
||||
>>> x = rng.random(size=(3, 7))
|
||||
>>> y = rng.random(size=(3, 8))
|
||||
>>> stats.mstats.compare_medians_ms(x, y, axis=1)
|
||||
array([0.36908985, 0.36092538, 0.2765313 ])
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
|
||||
for studentizing the sample median." Communications in
|
||||
Statistics-Simulation and Computation 13.6 (1984): 751-773.
|
||||
|
||||
"""
|
||||
(med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
|
||||
(std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
|
||||
mstats.stde_median(group_2, axis=axis))
|
||||
W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
|
||||
return 1 - norm.cdf(W)
|
||||
|
||||
|
||||
def idealfourths(data, axis=None):
    """
    Returns an estimate of the lower and upper quartiles.

    Uses the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the arrays
        are flattened.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        The two interior values that divide `data` into four parts, computed
        either along the flattened array (if `axis` is None) or along `axis`
        of `data`. Fewer than three unmasked values give ``[nan, nan]``.

    """
    def _ideal_1d(arr):
        vals = arr.compressed()
        n = len(vals)
        if n < 3:
            return [np.nan, np.nan]
        # Fractional rank of the ideal fourth: j + h = n/4 + 5/12.
        j, h = divmod(n/4. + 5/12., 1)
        j = int(j)
        lower = (1-h)*vals[j-1] + h*vals[j]
        k = n - j
        upper = (1-h)*vals[k] + h*vals[k-1]
        return [lower, upper]

    data = ma.sort(data, axis=axis).view(MaskedArray)
    if axis is None:
        return _ideal_1d(data)
    return ma.apply_along_axis(_ideal_1d, axis, data)
|
||||
|
||||
|
||||
def rsh(data, points=None):
    """
    Evaluates Rosenblatt's shifted histogram estimators for each data point.

    Rosenblatt's estimator is a centered finite-difference approximation to
    the derivative of the empirical cumulative distribution function.

    Parameters
    ----------
    data : sequence
        Input data, should be 1-D. Masked values are ignored.
    points : sequence or None, optional
        Sequence of points where to evaluate Rosenblatt shifted histogram.
        If None, use the data.

    """
    data = ma.array(data, copy=False)
    points = data if points is None else np.atleast_1d(np.asarray(points))

    if data.ndim != 1:
        raise AttributeError("The input array should be 1D only !")

    n = data.count()
    # Bandwidth from the interquartile spread, shrinking like n**(-1/5).
    quartiles = idealfourths(data, axis=None)
    h = 1.2 * (quartiles[-1] - quartiles[0]) / n**(1./5)
    shifted = points[None, :]
    above = (data[:, None] <= shifted + h).sum(0)
    below = (data[:, None] < shifted - h).sum(0)
    return (above - below) / (2.*n*h)
|
||||
459
venv/lib/python3.12/site-packages/scipy/stats/_multicomp.py
Normal file
459
venv/lib/python3.12/site-packages/scipy/stats/_multicomp.py
Normal file
@ -0,0 +1,459 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy import stats
|
||||
from scipy.optimize import minimize_scalar
|
||||
from scipy.stats._common import ConfidenceInterval
|
||||
from scipy.stats._qmc import check_random_state
|
||||
from scipy.stats._stats_py import _var
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy.typing as npt
|
||||
from scipy._lib._util import DecimalNumber, SeedType
|
||||
from typing import Literal, Sequence # noqa: UP035
|
||||
|
||||
|
||||
__all__ = [
|
||||
'dunnett'
|
||||
]
|
||||
|
||||
|
||||
@dataclass
class DunnettResult:
    """Result object returned by `scipy.stats.dunnett`.

    Attributes
    ----------
    statistic : float ndarray
        The computed statistic of the test for each comparison. The element
        at index ``i`` is the statistic for the comparison between
        groups ``i`` and the control.
    pvalue : float ndarray
        The computed p-value of the test for each comparison. The element
        at index ``i`` is the p-value for the comparison between
        group ``i`` and the control.
    """
    statistic: np.ndarray
    pvalue: np.ndarray
    # Private state carried over from `dunnett` so that confidence intervals
    # can be computed lazily; all excluded from the dataclass repr.
    _alternative: Literal['two-sided', 'less', 'greater'] = field(repr=False)
    _rho: np.ndarray = field(repr=False)
    _df: int = field(repr=False)
    _std: float = field(repr=False)
    _mean_samples: np.ndarray = field(repr=False)
    _mean_control: np.ndarray = field(repr=False)
    _n_samples: np.ndarray = field(repr=False)
    _n_control: int = field(repr=False)
    _rng: SeedType = field(repr=False)
    # Cache of the most recently computed CI and its confidence level.
    _ci: ConfidenceInterval | None = field(default=None, repr=False)
    _ci_cl: DecimalNumber | None = field(default=None, repr=False)

    def __str__(self):
        # Note: `__str__` prints the confidence intervals from the most
        # recent call to `confidence_interval`. If it has not been called,
        # it will be called with the default CL of .95.
        if self._ci is None:
            self.confidence_interval(confidence_level=.95)
        s = (
            "Dunnett's test"
            f" ({self._ci_cl*100:.1f}% Confidence Interval)\n"
            "Comparison Statistic p-value Lower CI Upper CI\n"
        )
        for i in range(self.pvalue.size):
            s += (f" (Sample {i} - Control) {self.statistic[i]:>10.3f}"
                  f"{self.pvalue[i]:>10.3f}"
                  f"{self._ci.low[i]:>10.3f}"
                  f"{self._ci.high[i]:>10.3f}\n")

        return s

    def _allowance(
        self, confidence_level: DecimalNumber = 0.95, tol: DecimalNumber = 1e-3
    ) -> float:
        """Allowance.

        It is the quantity to add/subtract from the observed difference
        between the means of observed groups and the mean of the control
        group. The result gives confidence limits.

        Parameters
        ----------
        confidence_level : float, optional
            Confidence level for the computed confidence interval.
            Default is .95.
        tol : float, optional
            A tolerance for numerical optimization: the allowance will produce
            a confidence within ``10*tol*(1 - confidence_level)`` of the
            specified level, or a warning will be emitted. Tight tolerances
            may be impractical due to noisy evaluation of the objective.
            Default is 1e-3.

        Returns
        -------
        allowance : float
            Allowance around the mean.
        """
        alpha = 1 - confidence_level

        # Relative distance between the p-value at a candidate critical
        # statistic and the target error rate `alpha`; minimized below.
        def pvalue_from_stat(statistic):
            statistic = np.array(statistic)
            sf = _pvalue_dunnett(
                rho=self._rho, df=self._df,
                statistic=statistic, alternative=self._alternative,
                rng=self._rng
            )
            return abs(sf - alpha)/alpha

        # Evaluation of `pvalue_from_stat` is noisy due to the use of RQMC to
        # evaluate `multivariate_t.cdf`. `minimize_scalar` is not designed
        # to tolerate a noisy objective function and may fail to find the
        # minimum accurately. We mitigate this possibility with the validation
        # step below, but implementation of a noise-tolerant root finder or
        # minimizer would be a welcome enhancement. See gh-18150.
        res = minimize_scalar(pvalue_from_stat, method='brent', tol=tol)
        critical_value = res.x

        # validation
        # tol*10 because tol=1e-3 means we tolerate a 1% change at most
        if res.success is False or res.fun >= tol*10:
            # NOTE(review): the message below reports ``alpha*(1+res.fun)``,
            # which is an achieved error rate, while calling it a confidence
            # level — presumably ``1 - alpha*(1+res.fun)`` was intended;
            # confirm upstream before changing.
            warnings.warn(
                "Computation of the confidence interval did not converge to "
                "the desired level. The confidence level corresponding with "
                f"the returned interval is approximately {alpha*(1+res.fun)}.",
                stacklevel=3
            )

        # From [1] p. 1101 between (1) and (3)
        allowance = critical_value*self._std*np.sqrt(
            1/self._n_samples + 1/self._n_control
        )
        return abs(allowance)

    def confidence_interval(
        self, confidence_level: DecimalNumber = 0.95
    ) -> ConfidenceInterval:
        """Compute the confidence interval for the specified confidence level.

        Parameters
        ----------
        confidence_level : float, optional
            Confidence level for the computed confidence interval.
            Default is .95.

        Returns
        -------
        ci : ``ConfidenceInterval`` object
            The object has attributes ``low`` and ``high`` that hold the
            lower and upper bounds of the confidence intervals for each
            comparison. The high and low values are accessible for each
            comparison at index ``i`` for each group ``i``.

        """
        # check to see if the supplied confidence level matches that of the
        # previously computed CI.
        if (self._ci is not None) and (confidence_level == self._ci_cl):
            return self._ci

        if not (0 < confidence_level < 1):
            raise ValueError("Confidence level must be between 0 and 1.")

        allowance = self._allowance(confidence_level=confidence_level)
        diff_means = self._mean_samples - self._mean_control

        low = diff_means-allowance
        high = diff_means+allowance

        # One-sided alternatives leave the opposite bound unbounded.
        if self._alternative == 'greater':
            high = [np.inf] * len(diff_means)
        elif self._alternative == 'less':
            low = [-np.inf] * len(diff_means)

        self._ci_cl = confidence_level
        self._ci = ConfidenceInterval(
            low=low,
            high=high
        )
        return self._ci
|
||||
|
||||
|
||||
def dunnett(
    *samples: npt.ArrayLike,  # noqa: D417
    control: npt.ArrayLike,
    alternative: Literal['two-sided', 'less', 'greater'] = "two-sided",
    random_state: SeedType = None
) -> DunnettResult:
    """Dunnett's test: multiple comparisons of means against a control group.

    This is an implementation of Dunnett's original, single-step test as
    described in [1]_.

    Parameters
    ----------
    sample1, sample2, ... : 1D array_like
        The sample measurements for each experimental group.
    control : 1D array_like
        The sample measurements for the control group.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. The null hypothesis is that the
        means of the distributions underlying the samples and control are
        equal. The alternatives are: the means are unequal ('two-sided',
        the default); the sample means are less than the control mean
        ('less'); the sample means are greater than the control mean
        ('greater').
    random_state : {None, int, `numpy.random.Generator`}, optional
        If an int or None, a new `numpy.random.Generator` is created with
        ``np.random.default_rng(random_state)``; a ``Generator`` instance is
        used as provided. It controls the randomized Quasi-Monte Carlo
        integration of the multivariate-t distribution.

    Returns
    -------
    res : `~scipy.stats._result_classes.DunnettResult`
        An object whose ``statistic`` and ``pvalue`` attributes hold, at
        index ``i``, the statistic and p-value of the comparison between
        group ``i`` and the control, and whose
        ``confidence_interval(confidence_level=0.95)`` method computes the
        difference in group and control means +- the allowance.

    See Also
    --------
    tukey_hsd : performs pairwise comparison of means.

    Notes
    -----
    Performing several t-tests at a fixed significance level inflates the
    family-wise error rate. Dunnett's test [1]_ compares the means of
    multiple experimental groups against a single control group while
    controlling that rate; unlike `tukey_hsd`, it does not perform all
    pairwise comparisons, which gives it higher power when only
    group-vs-control comparisons are needed. The test assumes independent
    observations, normally distributed groups, and a common finite variance.

    References
    ----------
    .. [1] Charles W. Dunnett. "A Multiple Comparison Procedure for Comparing
       Several Treatments with a Control."
       Journal of the American Statistical Association, 50:272, 1096-1121,
       :doi:`10.1080/01621459.1955.10501294`, 1955.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import dunnett
    >>> control = np.array([7.40, 8.50, 7.20, 8.24, 9.84, 8.32])
    >>> drug_a = np.array([9.76, 8.80, 7.68, 9.36])
    >>> drug_b = np.array([12.80, 9.68, 12.16, 9.20, 10.55])
    >>> res = dunnett(drug_a, drug_b, control=control)
    >>> res.pvalue
    array([0.62004941, 0.0059035 ])  # may vary

    At a 5% family-wise error rate, drug A is not significantly different
    from the control, while drug B is.

    """
    # Canonicalize/validate inputs and resolve the RNG.
    samples_, control_, rng = _iv_dunnett(
        samples=samples, control=control,
        alternative=alternative, random_state=random_state
    )

    # Correlation matrix, degrees of freedom and group sizes.
    rho, df, n_group, n_samples, n_control = _params_dunnett(
        samples=samples_, control=control_
    )

    # Studentized statistics plus the moments needed later for the CI.
    statistic, std, mean_control, mean_samples = _statistic_dunnett(
        samples_, control_, df, n_samples, n_control
    )

    # p-values from the multivariate t-distribution.
    pvalue = _pvalue_dunnett(
        rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng
    )

    return DunnettResult(
        statistic=statistic, pvalue=pvalue,
        _alternative=alternative,
        _rho=rho, _df=df, _std=std,
        _mean_samples=mean_samples,
        _mean_control=mean_control,
        _n_samples=n_samples,
        _n_control=n_control,
        _rng=rng
    )
|
||||
|
||||
|
||||
def _iv_dunnett(
|
||||
samples: Sequence[npt.ArrayLike],
|
||||
control: npt.ArrayLike,
|
||||
alternative: Literal['two-sided', 'less', 'greater'],
|
||||
random_state: SeedType
|
||||
) -> tuple[list[np.ndarray], np.ndarray, SeedType]:
|
||||
"""Input validation for Dunnett's test."""
|
||||
rng = check_random_state(random_state)
|
||||
|
||||
if alternative not in {'two-sided', 'less', 'greater'}:
|
||||
raise ValueError(
|
||||
"alternative must be 'less', 'greater' or 'two-sided'"
|
||||
)
|
||||
|
||||
ndim_msg = "Control and samples groups must be 1D arrays"
|
||||
n_obs_msg = "Control and samples groups must have at least 1 observation"
|
||||
|
||||
control = np.asarray(control)
|
||||
samples_ = [np.asarray(sample) for sample in samples]
|
||||
|
||||
# samples checks
|
||||
samples_control: list[np.ndarray] = samples_ + [control]
|
||||
for sample in samples_control:
|
||||
if sample.ndim > 1:
|
||||
raise ValueError(ndim_msg)
|
||||
|
||||
if sample.size < 1:
|
||||
raise ValueError(n_obs_msg)
|
||||
|
||||
return samples_, control, rng
|
||||
|
||||
|
||||
def _params_dunnett(
|
||||
samples: list[np.ndarray], control: np.ndarray
|
||||
) -> tuple[np.ndarray, int, int, np.ndarray, int]:
|
||||
"""Specific parameters for Dunnett's test.
|
||||
|
||||
Degree of freedom is the number of observations minus the number of groups
|
||||
including the control.
|
||||
"""
|
||||
n_samples = np.array([sample.size for sample in samples])
|
||||
|
||||
# From [1] p. 1100 d.f. = (sum N)-(p+1)
|
||||
n_sample = n_samples.sum()
|
||||
n_control = control.size
|
||||
n = n_sample + n_control
|
||||
n_groups = len(samples)
|
||||
df = n - n_groups - 1
|
||||
|
||||
# From [1] p. 1103 rho_ij = 1/sqrt((N0/Ni+1)(N0/Nj+1))
|
||||
rho = n_control/n_samples + 1
|
||||
rho = 1/np.sqrt(rho[:, None] * rho[None, :])
|
||||
np.fill_diagonal(rho, 1)
|
||||
|
||||
return rho, df, n_groups, n_samples, n_control
|
||||
|
||||
|
||||
def _statistic_dunnett(
|
||||
samples: list[np.ndarray], control: np.ndarray, df: int,
|
||||
n_samples: np.ndarray, n_control: int
|
||||
) -> tuple[np.ndarray, float, np.ndarray, np.ndarray]:
|
||||
"""Statistic of Dunnett's test.
|
||||
|
||||
Computation based on the original single-step test from [1].
|
||||
"""
|
||||
mean_control = np.mean(control)
|
||||
mean_samples = np.array([np.mean(sample) for sample in samples])
|
||||
all_samples = [control] + samples
|
||||
all_means = np.concatenate([[mean_control], mean_samples])
|
||||
|
||||
# Variance estimate s^2 from [1] Eq. 1
|
||||
s2 = np.sum([_var(sample, mean=mean)*sample.size
|
||||
for sample, mean in zip(all_samples, all_means)]) / df
|
||||
std = np.sqrt(s2)
|
||||
|
||||
# z score inferred from [1] unlabeled equation after Eq. 1
|
||||
z = (mean_samples - mean_control) / np.sqrt(1/n_samples + 1/n_control)
|
||||
|
||||
return z / std, std, mean_control, mean_samples
|
||||
|
||||
|
||||
def _pvalue_dunnett(
|
||||
rho: np.ndarray, df: int, statistic: np.ndarray,
|
||||
alternative: Literal['two-sided', 'less', 'greater'],
|
||||
rng: SeedType = None
|
||||
) -> np.ndarray:
|
||||
"""pvalue from the multivariate t-distribution.
|
||||
|
||||
Critical values come from the multivariate student-t distribution.
|
||||
"""
|
||||
statistic = statistic.reshape(-1, 1)
|
||||
|
||||
mvt = stats.multivariate_t(shape=rho, df=df, seed=rng)
|
||||
if alternative == "two-sided":
|
||||
statistic = abs(statistic)
|
||||
pvalue = 1 - mvt.cdf(statistic, lower_limit=-statistic)
|
||||
elif alternative == "greater":
|
||||
pvalue = 1 - mvt.cdf(statistic, lower_limit=-np.inf)
|
||||
else:
|
||||
pvalue = 1 - mvt.cdf(np.inf, lower_limit=statistic)
|
||||
|
||||
return np.atleast_1d(pvalue)
|
||||
6981
venv/lib/python3.12/site-packages/scipy/stats/_multivariate.py
Normal file
6981
venv/lib/python3.12/site-packages/scipy/stats/_multivariate.py
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
482
venv/lib/python3.12/site-packages/scipy/stats/_odds_ratio.py
Normal file
482
venv/lib/python3.12/site-packages/scipy/stats/_odds_ratio.py
Normal file
@ -0,0 +1,482 @@
|
||||
import numpy as np
|
||||
|
||||
from scipy.special import ndtri
|
||||
from scipy.optimize import brentq
|
||||
from ._discrete_distns import nchypergeom_fisher
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
def _sample_odds_ratio(table):
|
||||
"""
|
||||
Given a table [[a, b], [c, d]], compute a*d/(b*c).
|
||||
|
||||
Return nan if the numerator and denominator are 0.
|
||||
Return inf if just the denominator is 0.
|
||||
"""
|
||||
# table must be a 2x2 numpy array.
|
||||
if table[1, 0] > 0 and table[0, 1] > 0:
|
||||
oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
|
||||
elif table[0, 0] == 0 or table[1, 1] == 0:
|
||||
oddsratio = np.nan
|
||||
else:
|
||||
oddsratio = np.inf
|
||||
return oddsratio
|
||||
|
||||
|
||||
def _solve(func):
|
||||
"""
|
||||
Solve func(nc) = 0. func must be an increasing function.
|
||||
"""
|
||||
# We could just as well call the variable `x` instead of `nc`, but we
|
||||
# always call this function with functions for which nc (the noncentrality
|
||||
# parameter) is the variable for which we are solving.
|
||||
nc = 1.0
|
||||
value = func(nc)
|
||||
if value == 0:
|
||||
return nc
|
||||
|
||||
# Multiplicative factor by which to increase or decrease nc when
|
||||
# searching for a bracketing interval.
|
||||
factor = 2.0
|
||||
# Find a bracketing interval.
|
||||
if value > 0:
|
||||
nc /= factor
|
||||
while func(nc) > 0:
|
||||
nc /= factor
|
||||
lo = nc
|
||||
hi = factor*nc
|
||||
else:
|
||||
nc *= factor
|
||||
while func(nc) < 0:
|
||||
nc *= factor
|
||||
lo = nc/factor
|
||||
hi = nc
|
||||
|
||||
# lo and hi bracket the solution for nc.
|
||||
nc = brentq(func, lo, hi, xtol=1e-13)
|
||||
return nc
|
||||
|
||||
|
||||
def _nc_hypergeom_mean_inverse(x, M, n, N):
    """
    Find the noncentrality parameter of Fisher's noncentral hypergeometric
    distribution whose mean is x, given the parameters x, M, n and N
    (table[0,0], total, row 0 sum and column 0 sum, resp., of a 2x2
    contingency table).
    """
    # The mean is increasing in nc, so `_solve` can bracket and bisect it.
    return _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
|
||||
|
||||
|
||||
def _hypergeom_params_from_table(table):
|
||||
# The notation M, n and N is consistent with stats.hypergeom and
|
||||
# stats.nchypergeom_fisher.
|
||||
x = table[0, 0]
|
||||
M = table.sum()
|
||||
n = table[0].sum()
|
||||
N = table[:, 0].sum()
|
||||
return x, M, n, N
|
||||
|
||||
|
||||
def _ci_upper(table, alpha):
    """
    Compute the upper end of the confidence interval.
    """
    if _sample_odds_ratio(table) == np.inf:
        return np.inf

    x, M, n, N = _hypergeom_params_from_table(table)

    # nchypergeom_fisher.cdf is a decreasing function of nc, so flip its
    # sign to give `_solve` an increasing objective.
    return _solve(lambda nc: alpha - nchypergeom_fisher.cdf(x, M, n, N, nc))
|
||||
|
||||
|
||||
def _ci_lower(table, alpha):
    """
    Compute the lower end of the confidence interval.
    """
    if _sample_odds_ratio(table) == 0:
        return 0

    x, M, n, N = _hypergeom_params_from_table(table)

    # The survival function is increasing in nc, so it can go to `_solve`
    # directly.
    return _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
|
||||
|
||||
|
||||
def _conditional_oddsratio(table):
    """
    Conditional MLE of the odds ratio for the 2x2 contingency table.
    """
    x, M, n, N = _hypergeom_params_from_table(table)
    # The support of the noncentral hypergeometric distribution with
    # parameters M, n, and N does not depend on the noncentrality
    # parameter, so nc=1 is enough to obtain its bounds.
    lo, hi = nchypergeom_fisher.support(M, n, N, 1)

    # At either extreme of the support the MLE is degenerate: 0 at the low
    # end, inf at the high end.
    if x == lo:
        return 0
    if x == hi:
        return np.inf

    return _nc_hypergeom_mean_inverse(x, M, n, N)
|
||||
|
||||
|
||||
def _conditional_oddsratio_ci(table, confidence_level=0.95,
                              alternative='two-sided'):
    """
    Conditional exact confidence interval for the odds ratio.
    """
    if alternative == 'two-sided':
        # Split the error rate evenly between the two tails.
        alpha = 0.5*(1 - confidence_level)
        return _ci_lower(table, alpha), _ci_upper(table, alpha)
    if alternative == 'less':
        return 0.0, _ci_upper(table, 1 - confidence_level)
    # alternative == 'greater'
    return _ci_lower(table, 1 - confidence_level), np.inf
|
||||
|
||||
|
||||
def _sample_odds_ratio_ci(table, confidence_level=0.95,
                          alternative='two-sided'):
    """Normal-approximation CI for the log of the sample odds ratio,
    exponentiated back to the odds-ratio scale."""
    log_or = np.log(_sample_odds_ratio(table))
    se = np.sqrt((1/table).sum())
    if alternative == 'less':
        loglow = -np.inf
        loghigh = log_or + ndtri(confidence_level)*se
    elif alternative == 'greater':
        loglow = log_or - ndtri(confidence_level)*se
        loghigh = np.inf
    else:
        # alternative is 'two-sided'
        half_z = ndtri(0.5*confidence_level + 0.5)
        loglow = log_or - half_z*se
        loghigh = log_or + half_z*se

    return np.exp(loglow), np.exp(loghigh)
|
||||
|
||||
|
||||
class OddsRatioResult:
    """
    Result of `scipy.stats.contingency.odds_ratio`. See the
    docstring for `odds_ratio` for more details.

    Attributes
    ----------
    statistic : float
        The computed odds ratio. When `kind` is ``'sample'``, this is the
        sample (unconditional) estimate
        ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``. When `kind`
        is ``'conditional'``, it is the conditional maximum likelihood
        estimate: the noncentrality parameter of Fisher's noncentral
        hypergeometric distribution with the same hypergeometric parameters
        as `table` and whose mean is ``table[0, 0]``.

    Methods
    -------
    confidence_interval :
        Confidence interval for the odds ratio.
    """

    def __init__(self, _table, _kind, statistic):
        # _table and _kind stay private for now: very few `scipy.stats`
        # results return this sort of bookkeeping information.
        self._table = _table
        self._kind = _kind
        self.statistic = statistic

    def __repr__(self):
        return f"OddsRatioResult(statistic={self.statistic})"

    def confidence_interval(self, confidence_level=0.95,
                            alternative='two-sided'):
        """
        Confidence interval for the odds ratio.

        Parameters
        ----------
        confidence_level: float
            Desired confidence level, as a fraction between 0 and 1.
            Default is 0.95 (meaning 95%).
        alternative : {'two-sided', 'less', 'greater'}, optional
            The alternative hypothesis the interval corresponds to
            (default 'two-sided'). With null hypothesis ``OR`` and interval
            ``(low, high)``: 'two-sided' gives evidence against the null at
            the chosen level when ``high < OR`` or ``low > OR``; 'less'
            fixes ``low`` at 0 (evidence when ``high < OR``); 'greater'
            fixes ``high`` at ``np.inf`` (evidence when ``low > OR``).

        Returns
        -------
        ci : ``ConfidenceInterval`` instance
            The confidence interval, represented as an object with
            attributes ``low`` and ``high``.

        Notes
        -----
        For ``kind='conditional'`` the limits are Fisher's conditional
        "exact confidence limits" [1]_ (see also Sahai and Khurshid [2]_,
        section 4.1.2). For ``kind='sample'`` the interval assumes the log
        odds ratio is normal with standard error
        ``sqrt(1/a + 1/b + 1/c + 1/d)`` ([2]_ section 3.1.3.2; [3]_
        section 2.3.3).

        References
        ----------
        .. [1] R. A. Fisher (1935), The logic of inductive inference,
               Journal of the Royal Statistical Society, Vol. 98, No. 1,
               pp. 39-82.
        .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
               Methods, Techniques, and Applications, CRC Press LLC, Boca
               Raton, Florida.
        .. [3] Alan Agresti, An Introduction to Categorical Data Analysis
               (second edition), Wiley, Hoboken, NJ, USA (2007).
        """
        if alternative not in ['two-sided', 'less', 'greater']:
            raise ValueError("`alternative` must be 'two-sided', 'less' or "
                             "'greater'.")

        if confidence_level < 0 or confidence_level > 1:
            raise ValueError('confidence_level must be between 0 and 1')

        if self._kind == 'conditional':
            return self._conditional_odds_ratio_ci(confidence_level,
                                                   alternative)
        return self._sample_odds_ratio_ci(confidence_level, alternative)

    def _conditional_odds_ratio_ci(self, confidence_level=0.95,
                                   alternative='two-sided'):
        """
        Confidence interval for the conditional odds ratio.
        """
        table = self._table
        if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
            # A zero row or column sum: the p-value is 1, the odds ratio
            # is NaN and the confidence interval is (0, inf).
            low, high = 0, np.inf
        else:
            low, high = _conditional_oddsratio_ci(
                table, confidence_level=confidence_level,
                alternative=alternative)
        return ConfidenceInterval(low=low, high=high)

    def _sample_odds_ratio_ci(self, confidence_level=0.95,
                              alternative='two-sided'):
        """
        Confidence interval for the sample odds ratio.
        """
        if confidence_level < 0 or confidence_level > 1:
            raise ValueError('confidence_level must be between 0 and 1')

        table = self._table
        if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
            # A zero row or column sum: the p-value is 1, the odds ratio
            # is NaN and the confidence interval is (0, inf).
            low, high = 0, np.inf
        else:
            low, high = _sample_odds_ratio_ci(
                table, confidence_level=confidence_level,
                alternative=alternative)
        return ConfidenceInterval(low=low, high=high)
|
||||
|
||||
|
||||
def odds_ratio(table, *, kind='conditional'):
    r"""
    Compute the odds ratio for a 2x2 contingency table.

    Parameters
    ----------
    table : array_like of ints
        A 2x2 contingency table.  Elements must be non-negative integers.
    kind : str, optional
        Which kind of odds ratio to compute: the sample odds ratio
        (``kind='sample'``) or the conditional odds ratio
        (``kind='conditional'``).  Default is ``'conditional'``.

    Returns
    -------
    result : `~scipy.stats._result_classes.OddsRatioResult` instance
        Object with attribute ``statistic`` -- for ``kind='sample'`` the
        sample (unconditional) estimate
        ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``, and for
        ``kind='conditional'`` the conditional maximum likelihood
        estimate (the noncentrality parameter of Fisher's noncentral
        hypergeometric distribution whose mean is ``table[0, 0]``).
        The object also has a ``confidence_interval`` method.

    Raises
    ------
    ValueError
        If `kind` is not one of the two recognized strings, or if `table`
        is not 2x2, not integer-valued, or contains negative entries.

    Notes
    -----
    If any row or column of the table sums to zero, the odds ratio is
    undefined and ``statistic`` is NaN.

    The conditional odds ratio was discussed by Fisher ("Example 1" of
    "The logic of inductive inference", JRSS 98(1), 1935); see also
    Sahai & Khurshid, *Statistics in Epidemiology* (1996).

    .. versionadded:: 1.10.0
    """
    if kind not in ('conditional', 'sample'):
        raise ValueError("`kind` must be 'conditional' or 'sample'.")

    c = np.asarray(table)

    if c.shape != (2, 2):
        raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
                         "of shape (2, 2).")

    if not np.issubdtype(c.dtype, np.integer):
        raise ValueError("`table` must be an array of integers, but got "
                         f"type {c.dtype}")
    c = c.astype(np.int64)

    if np.any(c < 0):
        raise ValueError("All values in `table` must be nonnegative.")

    # A zero row or column sum leaves the odds ratio undefined.
    if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
        return OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)

    estimator = _sample_odds_ratio if kind == 'sample' else _conditional_oddsratio
    return OddsRatioResult(_table=c, _kind=kind, statistic=estimator(c))
|
||||
@ -0,0 +1,479 @@
|
||||
from itertools import permutations
|
||||
import numpy as np
|
||||
import math
|
||||
from ._continuous_distns import norm
|
||||
import scipy.stats
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class PageTrendTestResult:
    """Result object returned by `page_trend_test`."""
    # Page's L test statistic.
    statistic: float
    # The p-value associated with the statistic.
    pvalue: float
    # The method used to compute the p-value: 'asymptotic' or 'exact'.
    method: str
|
||||
|
||||
|
||||
def page_trend_test(data, ranked=False, predicted_ranks=None, method='auto'):
    r"""
    Perform Page's Test, a measure of trend in observations between treatments.

    Page's :math:`L` test is appropriate when there are :math:`n \geq 3`
    treatments, :math:`m \geq 2` subjects observed for each treatment, and
    the observations are hypothesized to have a particular order.  It tests
    the null hypothesis that all treatment means are equal,

    .. math:: m_1 = m_2 = \cdots = m_n,

    against the ordered alternative

    .. math:: m_1 \leq m_2 \leq \cdots \leq m_n,

    where at least one inequality is strict.  It has greater power than the
    Friedman test against ordered alternatives, because Friedman's test
    ignores the hypothesized order of the treatment means.

    Parameters
    ----------
    data : array-like
        An :math:`m \times n` array; the element in row :math:`i` and
        column :math:`j` is the observation for subject :math:`i` under
        treatment :math:`j`.  By default, the columns are assumed to be
        arranged in order of increasing predicted mean.
    ranked : bool, optional
        By default, `data` is assumed to be raw observations and is ranked
        with `scipy.stats.rankdata` along ``axis=1``.  Pass ``True`` if
        `data` already contains within-row ranks.
    predicted_ranks : array-like, optional
        The predicted ranks of the column means -- a permutation of the
        integers :math:`1 \dots n`.  Default is ``[1, 2, ..., n]``.
    method : {'auto', 'asymptotic', 'exact'}, optional
        How the *p*-value is computed.  ``'asymptotic'`` compares the
        standardized statistic against the normal distribution;
        ``'exact'`` enumerates the exact null distribution of :math:`L`;
        ``'auto'`` (default) chooses between them based on the dimensions
        of `data`, following the recommendations in Page's paper.

    Returns
    -------
    res : PageTrendTestResult
        An object with attributes ``statistic`` (Page's :math:`L`),
        ``pvalue``, and ``method`` (the method actually used,
        ``'asymptotic'`` or ``'exact'``).

    Raises
    ------
    TypeError
        If `ranked` is not a bool.
    ValueError
        If `method` is unrecognized; `data` is not 2-d, has fewer than two
        rows or three columns, contains NaNs, or fails the basic rank
        check when ``ranked=True``; or `predicted_ranks` is not a
        permutation of ``1..n``.

    Notes
    -----
    The *p*-values are not adjusted for ties; 'exact' p-values may then be
    somewhat conservative.  The exact null distribution is evaluated with
    the recursive method of Odeh (1977).

    References
    ----------
    .. [1] Ellis Batten Page, "Ordered hypotheses for multiple treatments:
           a significant test for linear ranks", JASA 58(301), 216--230, 1963.
    .. [2] Markus Neuhauser, *Nonparametric Statistical Tests: A
           computational approach*, CRC Press, 150--152, 2012.
    .. [3] Robert E. Odeh, "The exact distribution of Page's L-statistic in
           the two-way layout", *Communications in Statistics - Simulation
           and Computation*, 6(1), 49--61, 1977.
    """
    # Possible values of the method parameter and the corresponding function
    # used to evaluate the p value
    methods = {"asymptotic": _l_p_asymptotic,
               "exact": _l_p_exact,
               "auto": None}
    if method not in methods:
        raise ValueError(f"`method` must be in {set(methods)}")

    # Validate `ranked` up front: previously this check ran only after
    # `ranked` had already been consulted to decide whether to rank the
    # data, so a non-bool truthy value silently skipped ranking before
    # the TypeError fired.
    if not isinstance(ranked, bool):
        raise TypeError("`ranked` must be boolean.")

    ranks = np.asarray(data)
    if ranks.ndim != 2:  # TODO: relax this to accept 3d arrays?
        raise ValueError("`data` must be a 2d array.")

    m, n = ranks.shape
    if m < 2 or n < 3:
        raise ValueError("Page's L is only appropriate for data with two "
                         "or more rows and three or more columns.")

    if np.any(np.isnan(data)):
        raise ValueError("`data` contains NaNs, which cannot be ranked "
                         "meaningfully")

    # rank the data if it's not already ranked
    if ranked:
        # Only a basic check on whether data is ranked. Checking that the data
        # is properly ranked could take as much time as ranking it.
        if not (ranks.min() >= 1 and ranks.max() <= ranks.shape[1]):
            raise ValueError("`data` is not properly ranked. Rank the data or "
                             "pass `ranked=False`.")
    else:
        ranks = scipy.stats.rankdata(data, axis=-1)

    # generate predicted ranks if not provided, ensure valid NumPy array
    if predicted_ranks is None:
        predicted_ranks = np.arange(1, n+1)
    else:
        predicted_ranks = np.asarray(predicted_ranks)
        if (predicted_ranks.ndim < 1 or
                (set(predicted_ranks) != set(range(1, n+1)) or
                 len(predicted_ranks) != n)):
            raise ValueError(f"`predicted_ranks` must include each integer "
                             f"from 1 to {n} (the number of columns in "
                             f"`data`) exactly once.")

    # Calculate the L statistic
    L = _l_vectorized(ranks, predicted_ranks)

    # Calculate the p-value
    if method == "auto":
        method = _choose_method(ranks)
    p_fun = methods[method]  # get the function corresponding with the method
    p = p_fun(L, m, n)

    return PageTrendTestResult(statistic=L, pvalue=p, method=method)
|
||||
|
||||
|
||||
def _choose_method(ranks):
|
||||
'''Choose method for computing p-value automatically'''
|
||||
m, n = ranks.shape
|
||||
if n > 8 or (m > 12 and n > 3) or m > 20: # as in [1], [4]
|
||||
method = "asymptotic"
|
||||
else:
|
||||
method = "exact"
|
||||
return method
|
||||
|
||||
|
||||
def _l_vectorized(ranks, predicted_ranks):
|
||||
'''Calculate's Page's L statistic for each page of a 3d array'''
|
||||
colsums = ranks.sum(axis=-2, keepdims=True)
|
||||
products = predicted_ranks * colsums
|
||||
Ls = products.sum(axis=-1)
|
||||
Ls = Ls[0] if Ls.size == 1 else Ls.ravel()
|
||||
return Ls
|
||||
|
||||
|
||||
def _l_p_asymptotic(L, m, n):
|
||||
'''Calculate the p-value of Page's L from the asymptotic distribution'''
|
||||
# Using [1] as a reference, the asymptotic p-value would be calculated as:
|
||||
# chi_L = (12*L - 3*m*n*(n+1)**2)**2/(m*n**2*(n**2-1)*(n+1))
|
||||
# p = chi2.sf(chi_L, df=1, loc=0, scale=1)/2
|
||||
# but this is insensitive to the direction of the hypothesized ranking
|
||||
|
||||
# See [2] page 151
|
||||
E0 = (m*n*(n+1)**2)/4
|
||||
V0 = (m*n**2*(n+1)*(n**2-1))/144
|
||||
Lambda = (L-E0)/np.sqrt(V0)
|
||||
# This is a one-sided "greater" test - calculate the probability that the
|
||||
# L statistic under H0 would be greater than the observed L statistic
|
||||
p = norm.sf(Lambda)
|
||||
return p
|
||||
|
||||
|
||||
def _l_p_exact(L, m, n):
    '''Calculate the p-value of Page's L exactly.

    Delegates to the module-level `_pagel_state` cache, which evaluates
    the exact null distribution of L with the recursion of [5].
    '''
    # [1] uses m, n; [5] uses n, k.
    # Switch convention here because exact calculation code references [5].
    L, n, k = int(L), int(m), int(n)
    _pagel_state.set_k(k)
    return _pagel_state.sf(L, n)
|
||||
|
||||
|
||||
class _PageL:
    '''Maintains state between `page_trend_test` executions.

    Caches values of the exact null pmf of Page's L so that repeated
    calls with ``method='exact'`` do not redo the recursion of [5].
    '''

    def __init__(self):
        '''Lightweight initialization'''
        # Nested cache of pmf values, keyed as all_pmfs[n][k][l];
        # populated lazily by `pmf`.
        self.all_pmfs = {}

    def set_k(self, k):
        '''Calculate lower and upper limits of L for single row'''
        self.k = k
        # See [5] top of page 52
        self.a, self.b = (k*(k+1)*(k+2))//6, (k*(k+1)*(2*k+1))//6

    def sf(self, l, n):
        '''Survival function of Page's L statistic'''
        # Sum the pmf over every attainable L value at or above the
        # observed one.  The comprehension variable `l` deliberately
        # shadows the parameter; the range starts at the observed value.
        ps = [self.pmf(l, n) for l in range(l, n*self.b + 1)]
        return np.sum(ps)

    def p_l_k_1(self):
        '''Relative frequency of each L value over all possible single rows'''

        # See [5] Equation (6)
        ranks = range(1, self.k+1)
        # generate all possible rows of length k
        rank_perms = np.array(list(permutations(ranks)))
        # compute Page's L for all possible rows
        Ls = (ranks*rank_perms).sum(axis=1)
        # count occurrences of each L value; bins centered on the
        # integers a..b so each integer L gets its own bin
        counts = np.histogram(Ls, np.arange(self.a-0.5, self.b+1.5))[0]
        # factorial(k) is number of possible permutations
        return counts/math.factorial(self.k)

    def pmf(self, l, n):
        '''Recursive function to evaluate p(l, k, n); see [5] Equation 1'''

        # Make sure the nested cache dicts exist for this (n, k).
        if n not in self.all_pmfs:
            self.all_pmfs[n] = {}
        if self.k not in self.all_pmfs[n]:
            self.all_pmfs[n][self.k] = {}

        # Cache results to avoid repeating calculation. Initially this was
        # written with lru_cache, but this seems faster? Also, we could add
        # an option to save this for future lookup.
        if l in self.all_pmfs[n][self.k]:
            return self.all_pmfs[n][self.k][l]

        if n == 1:
            # Base case: single-row distribution computed by enumeration.
            ps = self.p_l_k_1()  # [5] Equation 6
            ls = range(self.a, self.b+1)
            # not fast, but we'll only be here once
            self.all_pmfs[n][self.k] = {l: p for l, p in zip(ls, ps)}
            return self.all_pmfs[n][self.k][l]

        p = 0
        # Limits of the last row's contribution t; see [5] Equation 2.
        low = max(l-(n-1)*self.b, self.a)  # [5] Equation 2
        high = min(l-(n-1)*self.a, self.b)

        # [5] Equation 1: convolve the (n-1)-row distribution with the
        # single-row distribution.
        for t in range(low, high+1):
            p1 = self.pmf(l-t, n-1)
            p2 = self.pmf(t, 1)
            p += p1*p2
        self.all_pmfs[n][self.k][l] = p
        return p
|
||||
|
||||
|
||||
# Maintain state for faster repeat calls to page_trend_test w/ method='exact'
# (the single _PageL instance accumulates cached pmf values across calls).
_pagel_state = _PageL()
|
||||
2786
venv/lib/python3.12/site-packages/scipy/stats/_qmc.py
Normal file
2786
venv/lib/python3.12/site-packages/scipy/stats/_qmc.py
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
54
venv/lib/python3.12/site-packages/scipy/stats/_qmc_cy.pyi
Normal file
54
venv/lib/python3.12/site-packages/scipy/stats/_qmc_cy.pyi
Normal file
@ -0,0 +1,54 @@
|
||||
import numpy as np
|
||||
from scipy._lib._util import DecimalNumber, IntNumber
|
||||
|
||||
|
||||
# Type stubs for the Cython-accelerated quasi-Monte Carlo routines in
# `_qmc_cy`: four discrepancy measures, an incremental discrepancy
# update, and (scrambled) van der Corput sequence generation.

def _cy_wrapper_centered_discrepancy(
    sample: np.ndarray,
    iterative: bool,
    workers: IntNumber,
) -> float: ...


def _cy_wrapper_wrap_around_discrepancy(
    sample: np.ndarray,
    iterative: bool,
    workers: IntNumber,
) -> float: ...


def _cy_wrapper_mixture_discrepancy(
    sample: np.ndarray,
    iterative: bool,
    workers: IntNumber,
) -> float: ...


def _cy_wrapper_l2_star_discrepancy(
    sample: np.ndarray,
    iterative: bool,
    workers: IntNumber,
) -> float: ...


def _cy_wrapper_update_discrepancy(
    x_new_view: np.ndarray,
    sample_view: np.ndarray,
    initial_disc: DecimalNumber,
) -> float: ...


def _cy_van_der_corput(
    n: IntNumber,
    base: IntNumber,
    start_index: IntNumber,
    workers: IntNumber,
) -> np.ndarray: ...


def _cy_van_der_corput_scrambled(
    n: IntNumber,
    base: IntNumber,
    start_index: IntNumber,
    permutations: np.ndarray,
    workers: IntNumber,
) -> np.ndarray: ...
|
||||
533
venv/lib/python3.12/site-packages/scipy/stats/_qmvnt.py
Normal file
533
venv/lib/python3.12/site-packages/scipy/stats/_qmvnt.py
Normal file
@ -0,0 +1,533 @@
|
||||
# Integration of multivariate normal and t distributions.
|
||||
|
||||
# Adapted from the MATLAB original implementations by Dr. Alan Genz.
|
||||
|
||||
# http://www.math.wsu.edu/faculty/genz/software/software.html
|
||||
|
||||
# Copyright (C) 2013, Alan Genz, All rights reserved.
|
||||
# Python implementation is copyright (C) 2022, Robert Kern, All rights
|
||||
# reserved.
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided the following conditions are met:
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in
|
||||
# the documentation and/or other materials provided with the
|
||||
# distribution.
|
||||
# 3. The contributor name(s) may not be used to endorse or promote
|
||||
# products derived from this software without specific prior
|
||||
# written permission.
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
||||
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy.fft import fft, ifft
|
||||
from scipy.special import gammaincinv, ndtr, ndtri
|
||||
from scipy.stats._qmc import primes_from_2_to
|
||||
|
||||
|
||||
phi = ndtr
|
||||
phinv = ndtri
|
||||
|
||||
|
||||
def _factorize_int(n):
|
||||
"""Return a sorted list of the unique prime factors of a positive integer.
|
||||
"""
|
||||
# NOTE: There are lots faster ways to do this, but this isn't terrible.
|
||||
factors = set()
|
||||
for p in primes_from_2_to(int(np.sqrt(n)) + 1):
|
||||
while not (n % p):
|
||||
factors.add(p)
|
||||
n //= p
|
||||
if n == 1:
|
||||
break
|
||||
if n != 1:
|
||||
factors.add(n)
|
||||
return sorted(factors)
|
||||
|
||||
|
||||
def _primitive_root(p):
    """Compute a primitive root of the prime number `p`.

    Used in the CBC lattice construction.

    A candidate `r` is a primitive root of `p` iff for every prime factor
    `q` of ``p - 1``, ``r ** ((p - 1) // q) != 1 (mod p)``.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Primitive_root_modulo_n
    """
    # p is prime, so the multiplicative group has order p - 1.
    pm = p - 1
    prime_factors = _factorize_int(pm)
    candidate = 2
    while True:
        for q in prime_factors:
            # pow() doesn't like numpy scalar types.
            if pow(int(candidate), int(pm // q), int(p)) == 1:
                # Candidate's order divides a proper subgroup; try the next.
                candidate += 1
                break
        else:
            # No factor test failed: candidate generates the whole group.
            return candidate
|
||||
|
||||
|
||||
def _cbc_lattice(n_dim, n_qmc_samples):
    """Compute a QMC lattice generator using a Fast CBC construction.

    Parameters
    ----------
    n_dim : int > 0
        The number of dimensions for the lattice.
    n_qmc_samples : int > 0
        The desired number of QMC samples. This will be rounded down to the
        nearest prime to enable the CBC construction.

    Returns
    -------
    q : float array : shape=(n_dim,)
        The lattice generator vector. All values are in the open interval
        `(0, 1)`.
    actual_n_qmc_samples : int
        The prime number of QMC samples that must be used with this lattice,
        no more, no less.

    References
    ----------
    .. [1] Nuyens, D. and Cools, R. "Fast Component-by-Component Construction,
        a Reprise for Different Kernels", In H. Niederreiter and D. Talay,
        editors, Monte-Carlo and Quasi-Monte Carlo Methods 2004,
        Springer-Verlag, 2006, 371-385.
    """
    # Round down to the nearest prime number.
    primes = primes_from_2_to(n_qmc_samples + 1)
    n_qmc_samples = primes[-1]

    # Per-dimension weights of the worst-case-error criterion; gm decays
    # geometrically so earlier dimensions are weighted more heavily.
    bt = np.ones(n_dim)
    gm = np.hstack([1.0, 0.8 ** np.arange(n_dim - 1)])
    q = 1
    w = 0
    # z holds the generator components; z[0] is fixed at 1 by convention.
    z = np.arange(1, n_dim + 1)
    m = (n_qmc_samples - 1) // 2
    g = _primitive_root(n_qmc_samples)
    # Slightly faster way to compute perm[j] = pow(g, j, n_qmc_samples)
    # Shame that we don't have modulo pow() implemented as a ufunc.
    perm = np.ones(m, dtype=int)
    for j in range(m - 1):
        perm[j + 1] = (g * perm[j]) % n_qmc_samples
    # Fold values above n/2 down: only half the residues are needed by
    # symmetry of the kernel.
    perm = np.minimum(n_qmc_samples - perm, perm)
    pn = perm / n_qmc_samples
    # Kernel values: this is the Bernoulli polynomial B2(x) = x^2 - x + 1/6,
    # the standard kernel for this CBC construction (see [1]).
    c = pn * pn - pn + 1.0 / 6
    fc = fft(c)
    for s in range(1, n_dim):
        # The "fast" part: each CBC step evaluates all candidate components
        # at once as a circulant matrix-vector product done via FFT.
        reordered = np.hstack([
            c[:w+1][::-1],
            c[w+1:m][::-1],
        ])
        q = q * (bt[s-1] + gm[s-1] * reordered)
        # Pick the candidate index minimizing the (real part of the)
        # convolution, i.e. the worst-case-error criterion.
        w = ifft(fc * fft(q)).real.argmin()
        z[s] = perm[w]
    # Return the generator scaled into (0, 1).
    q = z / n_qmc_samples
    return q, n_qmc_samples
|
||||
|
||||
|
||||
# Note: this function is not currently used or tested by any SciPy code. It is
# included in this file to facilitate the development of a parameter for users
# to set the desired CDF accuracy, but must be reviewed and tested before use.
def _qauto(func, covar, low, high, rng, error=1e-3, limit=10_000, **kwds):
    """Automatically rerun the integration to get the required error bound.

    Parameters
    ----------
    func : callable
        Either :func:`_qmvn` or :func:`_qmvt`.
    covar, low, high : array
        As specified in :func:`_qmvn` and :func:`_qmvt`.
    rng : Generator, optional
        default_rng(), yada, yada
    error : float > 0
        The desired error bound.
    limit : int > 0:
        The rough limit of the number of integration points to consider. The
        integration will stop looping once this limit has been *exceeded*.
    **kwds :
        Other keyword arguments to pass to `func`. When using :func:`_qmvt`, be
        sure to include ``nu=`` as one of these.

    Returns
    -------
    prob : float
        The estimated probability mass within the bounds.
    est_error : float
        3 times the standard error of the batch estimates.
    n_samples : int
        The number of integration points actually used.
    """
    n = len(covar)
    n_samples = 0
    if n == 1:
        # Univariate case: the answer is just a difference of normal CDFs,
        # no QMC needed.
        prob = phi(high) - phi(low)
        # More or less
        est_error = 1e-15
    else:
        # Initial batch size; grown by sqrt(2) each retry below.
        mi = min(limit, n * 1000)
        prob = 0.0
        est_error = 1.0
        ei = 0.0
        while est_error > error and n_samples < limit:
            mi = round(np.sqrt(2) * mi)
            pi, ei, ni = func(mi, covar, low, high, rng=rng, **kwds)
            n_samples += ni
            # Combine the new estimate with the running one, weighted by
            # their (squared) error estimates — an inverse-variance-style
            # update that favors the more accurate of the two.
            wt = 1.0 / (1 + (ei / est_error)**2)
            prob += wt * (pi - prob)
            est_error = np.sqrt(wt) * ei
    return prob, est_error, n_samples
|
||||
|
||||
|
||||
# Note: this function is not currently used or tested by any SciPy code. It is
# included in this file to facilitate the resolution of gh-8367, gh-16142, and
# possibly gh-14286, but must be reviewed and tested before use.
def _qmvn(m, covar, low, high, rng, lattice='cbc', n_batches=10):
    """Multivariate normal integration over box bounds.

    Parameters
    ----------
    m : int > n_batches
        The number of points to sample. This number will be divided into
        `n_batches` batches that apply random offsets of the sampling lattice
        for each batch in order to estimate the error.
    covar : (n, n) float array
        Possibly singular, positive semidefinite symmetric covariance matrix.
    low, high : (n,) float array
        The low and high integration bounds.
    rng : Generator, optional
        default_rng(), yada, yada
    lattice : 'cbc' or callable
        The type of lattice rule to use to construct the integration points.
        NOTE(review): this argument is currently unread by the body; only the
        CBC lattice is ever used.
    n_batches : int > 0, optional
        The number of QMC batches to apply.

    Returns
    -------
    prob : float
        The estimated probability mass within the bounds.
    est_error : float
        3 times the standard error of the batch estimates.
    n_samples : int
        The number of QMC points actually evaluated
        (``n_qmc_samples * n_batches``).
    """
    # Reduce to a triangular system with scaled/permuted bounds; this makes
    # the conditional-CDF recursion below numerically stable.
    cho, lo, hi = _permuted_cholesky(covar, low, high)
    n = cho.shape[0]
    # First coordinate is unconditional: its CDF bounds are fixed scalars.
    ct = cho[0, 0]
    c = phi(lo[0] / ct)
    d = phi(hi[0] / ct)
    ci = c
    dci = d - ci
    prob = 0.0
    error_var = 0.0
    # Only n - 1 QMC dimensions are needed; the first coordinate was
    # integrated out analytically above.
    q, n_qmc_samples = _cbc_lattice(n - 1, max(m // n_batches, 1))
    y = np.zeros((n - 1, n_qmc_samples))
    i_samples = np.arange(n_qmc_samples) + 1
    for j in range(n_batches):
        c = np.full(n_qmc_samples, ci)
        dc = np.full(n_qmc_samples, dci)
        pv = dc.copy()
        for i in range(1, n):
            # Pseudorandomly-shifted lattice coordinate.
            z = q[i - 1] * i_samples + rng.random()
            # Fast remainder(z, 1.0)
            z -= z.astype(int)
            # Tent periodization transform.
            x = abs(2 * z - 1)
            # Map the uniform variate into the conditional normal slab
            # [c, c + dc] for the previous coordinate.
            y[i - 1, :] = phinv(c + x * dc)
            # Conditional mean shift for coordinate i given y[0:i].
            s = cho[i, :i] @ y[:i, :]
            ct = cho[i, i]
            c = phi((lo[i] - s) / ct)
            d = phi((hi[i] - s) / ct)
            dc = d - c
            # Running product of conditional probabilities.
            pv = pv * dc
        # Accumulate the mean and error variances with online formulations.
        d = (pv.mean() - prob) / (j + 1)
        prob += d
        error_var = (j - 1) * error_var / (j + 1) + d * d
    # Error bounds are 3 times the standard error of the estimates.
    est_error = 3 * np.sqrt(error_var)
    n_samples = n_qmc_samples * n_batches
    return prob, est_error, n_samples
|
||||
|
||||
|
||||
# Note: this function is not currently used or tested by any SciPy code. It is
# included in this file to facilitate the resolution of gh-8367, gh-16142, and
# possibly gh-14286, but must be reviewed and tested before use.
def _mvn_qmc_integrand(covar, low, high, use_tent=False):
    """Transform the multivariate normal integration into a QMC integrand over
    a unit hypercube.

    The dimensionality of the resulting hypercube integration domain is one
    less than the dimensionality of the original integrand. Note that this
    transformation subsumes the integration bounds in order to account for
    infinite bounds. The QMC integration one does with the returned integrand
    should be on the unit hypercube.

    Parameters
    ----------
    covar : (n, n) float array
        Possibly singular, positive semidefinite symmetric covariance matrix.
    low, high : (n,) float array
        The low and high integration bounds.
    use_tent : bool, optional
        If True, then use tent periodization. Only helpful for lattice rules.

    Returns
    -------
    integrand : Callable[[NDArray], NDArray]
        The QMC-integrable integrand. It takes an
        ``(n_qmc_samples, ndim_integrand)`` array of QMC samples in the unit
        hypercube and returns the ``(n_qmc_samples,)`` evaluations of at these
        QMC points.
    ndim_integrand : int
        The dimensionality of the integrand. Equal to ``n-1``.
    """
    # Precompute the triangularized problem once; the returned closure
    # captures cho/lo/hi/ci/dci so repeated evaluations are cheap.
    cho, lo, hi = _permuted_cholesky(covar, low, high)
    n = cho.shape[0]
    ndim_integrand = n - 1
    # First coordinate is integrated out analytically (see _qmvn).
    ct = cho[0, 0]
    c = phi(lo[0] / ct)
    d = phi(hi[0] / ct)
    ci = c
    dci = d - ci

    def integrand(*zs):
        # Each element of `zs` is one hypercube coordinate across all of the
        # QMC sample points.
        ndim_qmc = len(zs)
        n_qmc_samples = len(np.atleast_1d(zs[0]))
        assert ndim_qmc == ndim_integrand
        y = np.zeros((ndim_qmc, n_qmc_samples))
        c = np.full(n_qmc_samples, ci)
        dc = np.full(n_qmc_samples, dci)
        pv = dc.copy()
        for i in range(1, n):
            if use_tent:
                # Tent periodization transform.
                x = abs(2 * zs[i-1] - 1)
            else:
                x = zs[i-1]
            # Same conditional-CDF recursion as in _qmvn.
            y[i - 1, :] = phinv(c + x * dc)
            s = cho[i, :i] @ y[:i, :]
            ct = cho[i, i]
            c = phi((lo[i] - s) / ct)
            d = phi((hi[i] - s) / ct)
            dc = d - c
            pv = pv * dc
        return pv

    return integrand, ndim_integrand
|
||||
|
||||
|
||||
def _qmvt(m, nu, covar, low, high, rng, lattice='cbc', n_batches=10):
    """Multivariate t integration over box bounds.

    Parameters
    ----------
    m : int > n_batches
        The number of points to sample. This number will be divided into
        `n_batches` batches that apply random offsets of the sampling lattice
        for each batch in order to estimate the error.
    nu : float >= 0
        The shape parameter of the multivariate t distribution.
    covar : (n, n) float array
        Possibly singular, positive semidefinite symmetric covariance matrix.
    low, high : (n,) float array
        The low and high integration bounds.
    rng : Generator, optional
        default_rng(), yada, yada
    lattice : 'cbc' or callable
        The type of lattice rule to use to construct the integration points.
        NOTE(review): this argument is currently unread by the body; only the
        CBC lattice is ever used.
    n_batches : int > 0, optional
        The number of QMC batches to apply.

    Returns
    -------
    prob : float
        The estimated probability mass within the bounds.
    est_error : float
        3 times the standard error of the batch estimates.
    n_samples : int
        The number of samples actually used.
    """
    # Pre-scale the bounds by sqrt(nu); the chi-based radial factor `r`
    # below reintroduces the t scaling per sample.
    sn = max(1.0, np.sqrt(nu))
    low = np.asarray(low, dtype=np.float64)
    high = np.asarray(high, dtype=np.float64)
    cho, lo, hi = _permuted_cholesky(covar, low / sn, high / sn)
    n = cho.shape[0]
    prob = 0.0
    error_var = 0.0
    q, n_qmc_samples = _cbc_lattice(n, max(m // n_batches, 1))
    i_samples = np.arange(n_qmc_samples) + 1
    for j in range(n_batches):
        pv = np.ones(n_qmc_samples)
        s = np.zeros((n, n_qmc_samples))
        for i in range(n):
            # Pseudorandomly-shifted lattice coordinate.
            z = q[i] * i_samples + rng.random()
            # Fast remainder(z, 1.0)
            z -= z.astype(int)
            # Tent periodization transform.
            x = abs(2 * z - 1)
            # FIXME: Lift the i==0 case out of the loop to make the logic
            # easier to follow.
            if i == 0:
                # We'll use one of the QR variates to pull out the
                # t-distribution scaling.
                if nu > 0:
                    r = np.sqrt(2 * gammaincinv(nu / 2, x))
                else:
                    r = np.ones_like(x)
            else:
                # `c` and `dc` are deliberately carried over from the
                # previous loop iteration (hence the F821 suppression):
                # they hold the conditional CDF bounds of coordinate i-1.
                y = phinv(c + x * dc)  # noqa: F821
                with np.errstate(invalid='ignore'):
                    s[i:, :] += cho[i:, i - 1][:, np.newaxis] * y
            si = s[i, :]

            # Evaluate phi only where |argument| < 9; beyond that the normal
            # CDF is numerically 0 or 1, so clamp instead.
            c = np.ones(n_qmc_samples)
            d = np.ones(n_qmc_samples)
            with np.errstate(invalid='ignore'):
                lois = lo[i] * r - si
                hiis = hi[i] * r - si
            c[lois < -9] = 0.0
            d[hiis < -9] = 0.0
            lo_mask = abs(lois) < 9
            hi_mask = abs(hiis) < 9
            c[lo_mask] = phi(lois[lo_mask])
            d[hi_mask] = phi(hiis[hi_mask])

            dc = d - c
            # Running product of conditional probabilities.
            pv *= dc

        # Accumulate the mean and error variances with online formulations.
        d = (pv.mean() - prob) / (j + 1)
        prob += d
        error_var = (j - 1) * error_var / (j + 1) + d * d
    # Error bounds are 3 times the standard error of the estimates.
    est_error = 3 * np.sqrt(error_var)
    n_samples = n_qmc_samples * n_batches
    return prob, est_error, n_samples
|
||||
|
||||
|
||||
def _permuted_cholesky(covar, low, high, tol=1e-10):
    """Compute a scaled, permuted Cholesky factor, with integration bounds.

    The scaling and permuting of the dimensions accomplishes part of the
    transformation of the original integration problem into a more numerically
    tractable form. The lower-triangular Cholesky factor will then be used in
    the subsequent integration. The integration bounds will be scaled and
    permuted as well.

    Parameters
    ----------
    covar : (n, n) float array
        Possibly singular, positive semidefinite symmetric covariance matrix.
    low, high : (n,) float array
        The low and high integration bounds.
    tol : float, optional
        The singularity tolerance.

    Returns
    -------
    cho : (n, n) float array
        Lower Cholesky factor, scaled and permuted.
    new_low, new_high : (n,) float array
        The scaled and permuted low and high integration bounds.

    Raises
    ------
    ValueError
        If `covar` is not square or the bounds do not match its dimension.
    """
    # Make copies for outputting.
    cho = np.array(covar, dtype=np.float64)
    new_lo = np.array(low, dtype=np.float64)
    new_hi = np.array(high, dtype=np.float64)
    n = cho.shape[0]
    if cho.shape != (n, n):
        raise ValueError("expected a square symmetric array")
    if new_lo.shape != (n,) or new_hi.shape != (n,):
        raise ValueError(
            "expected integration boundaries the same dimensions "
            "as the covariance matrix"
        )
    # Scale by the sqrt of the diagonal.
    dc = np.sqrt(np.maximum(np.diag(cho), 0.0))
    # But don't divide by 0.
    dc[dc == 0.0] = 1.0
    new_lo /= dc
    new_hi /= dc
    cho /= dc
    cho /= dc[:, np.newaxis]

    # y[k] holds the conditional expectation of coordinate k given the
    # bounds chosen so far; used to compute the pivot criterion.
    y = np.zeros(n)
    sqtp = np.sqrt(2 * np.pi)
    for k in range(n):
        epk = (k + 1) * tol
        im = k
        ck = 0.0
        dem = 1.0
        s = 0.0
        lo_m = 0.0
        hi_m = 0.0
        # Pivot search: pick the remaining coordinate with the *smallest*
        # conditional probability mass, which improves numerical stability
        # of the subsequent integration.
        for i in range(k, n):
            if cho[i, i] > tol:
                ci = np.sqrt(cho[i, i])
                if i > 0:
                    s = cho[i, :k] @ y[:k]
                lo_i = (new_lo[i] - s) / ci
                hi_i = (new_hi[i] - s) / ci
                de = phi(hi_i) - phi(lo_i)
                if de <= dem:
                    ck = ci
                    dem = de
                    lo_m = lo_i
                    hi_m = hi_i
                    im = i
        if im > k:
            # Swap im and k
            cho[im, im] = cho[k, k]
            _swap_slices(cho, np.s_[im, :k], np.s_[k, :k])
            _swap_slices(cho, np.s_[im + 1:, im], np.s_[im + 1:, k])
            _swap_slices(cho, np.s_[k + 1:im, k], np.s_[im, k + 1:im])
            _swap_slices(new_lo, k, im)
            _swap_slices(new_hi, k, im)
        if ck > epk:
            # Standard Cholesky elimination step with pivot ck.
            cho[k, k] = ck
            cho[k, k + 1:] = 0.0
            for i in range(k + 1, n):
                cho[i, k] /= ck
                cho[i, k + 1:i + 1] -= cho[i, k] * cho[k + 1:i + 1, k]
            if abs(dem) > tol:
                # Mean of a standard normal truncated to [lo_m, hi_m].
                y[k] = ((np.exp(-lo_m * lo_m / 2) - np.exp(-hi_m * hi_m / 2)) /
                        (sqtp * dem))
            else:
                # Negligible mass in the slab; fall back to the midpoint,
                # or the nearer finite-looking bound for one-sided slabs.
                y[k] = (lo_m + hi_m) / 2
                if lo_m < -10:
                    y[k] = hi_m
                elif hi_m > 10:
                    y[k] = lo_m
            cho[k, :k + 1] /= ck
            new_lo[k] /= ck
            new_hi[k] /= ck
        else:
            # (Numerically) singular direction: zero the column and use the
            # midpoint of the bounds as the conditional expectation.
            cho[k:, k] = 0.0
            y[k] = (new_lo[k] + new_hi[k]) / 2
    return cho, new_lo, new_hi
|
||||
|
||||
|
||||
def _swap_slices(x, slc1, slc2):
|
||||
t = x[slc1].copy()
|
||||
x[slc1] = x[slc2].copy()
|
||||
x[slc2] = t
|
||||
@ -0,0 +1,4 @@
|
||||
#
|
||||
from .rcont import rvs_rcont1, rvs_rcont2
|
||||
|
||||
__all__ = ["rvs_rcont1", "rvs_rcont2"]
|
||||
Binary file not shown.
263
venv/lib/python3.12/site-packages/scipy/stats/_relative_risk.py
Normal file
263
venv/lib/python3.12/site-packages/scipy/stats/_relative_risk.py
Normal file
@ -0,0 +1,263 @@
|
||||
import operator
|
||||
from dataclasses import dataclass
|
||||
import numpy as np
|
||||
from scipy.special import ndtri
|
||||
from ._common import ConfidenceInterval
|
||||
|
||||
|
||||
def _validate_int(n, bound, name):
|
||||
msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
|
||||
try:
|
||||
n = operator.index(n)
|
||||
except TypeError:
|
||||
raise TypeError(msg) from None
|
||||
if n < bound:
|
||||
raise ValueError(msg)
|
||||
return n
|
||||
|
||||
|
||||
@dataclass
class RelativeRiskResult:
    """
    Result of `scipy.stats.contingency.relative_risk`.

    Attributes
    ----------
    relative_risk : float
        This is::

            (exposed_cases/exposed_total) / (control_cases/control_total)

    exposed_cases : int
        The number of "cases" (i.e. occurrence of disease or other event
        of interest) among the sample of "exposed" individuals.
    exposed_total : int
        The total number of "exposed" individuals in the sample.
    control_cases : int
        The number of "cases" among the sample of "control" or non-exposed
        individuals.
    control_total : int
        The total number of "control" individuals in the sample.

    Methods
    -------
    confidence_interval :
        Compute the confidence interval for the relative risk estimate.
    """

    relative_risk: float
    exposed_cases: int
    exposed_total: int
    control_cases: int
    control_total: int

    def confidence_interval(self, confidence_level=0.95):
        """
        Compute the confidence interval for the relative risk.

        The confidence interval is computed using the Katz method
        (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).

        Parameters
        ----------
        confidence_level : float, optional
            The confidence level to use for the confidence interval.
            Default is 0.95.

        Returns
        -------
        ci : ConfidenceInterval instance
            The return value is an object with attributes ``low`` and
            ``high`` that hold the confidence interval.

        Raises
        ------
        ValueError
            If `confidence_level` is not in the interval [0, 1].

        References
        ----------
        .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
               confidence intervals for the risk ratio in cohort studies",
               Biometrics, 34, 469-474 (1978).
        .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
               CRC Press LLC, Boca Raton, FL, USA (1996).


        Examples
        --------
        >>> from scipy.stats.contingency import relative_risk
        >>> result = relative_risk(exposed_cases=10, exposed_total=75,
        ...                        control_cases=12, control_total=225)
        >>> result.relative_risk
        2.5
        >>> result.confidence_interval()
        ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
        """
        if not 0 <= confidence_level <= 1:
            raise ValueError('confidence_level must be in the interval '
                             '[0, 1].')

        # Handle edge cases where either exposed_cases or control_cases
        # is zero. We follow the convention of the R function riskratio
        # from the epitools library.
        if self.exposed_cases == 0 and self.control_cases == 0:
            # relative risk is nan.
            return ConfidenceInterval(low=np.nan, high=np.nan)
        elif self.exposed_cases == 0:
            # relative risk is 0.
            return ConfidenceInterval(low=0.0, high=np.nan)
        elif self.control_cases == 0:
            # relative risk is inf
            return ConfidenceInterval(low=np.nan, high=np.inf)

        alpha = 1 - confidence_level
        # ndtri is the inverse of the standard normal CDF; z is the
        # two-sided critical value for the requested confidence level.
        z = ndtri(1 - alpha/2)
        rr = self.relative_risk

        # Estimate of the variance of log(rr) is
        # var(log(rr)) = 1/exposed_cases - 1/exposed_total +
        #                1/control_cases - 1/control_total
        # and the standard error is the square root of that.
        se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
                     1/self.control_cases - 1/self.control_total)
        delta = z*se
        # The interval is symmetric on the log scale: rr * exp(+/- z*se).
        katz_lo = rr*np.exp(-delta)
        katz_hi = rr*np.exp(delta)
        return ConfidenceInterval(low=katz_lo, high=katz_hi)
|
||||
|
||||
|
||||
def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
    """
    Compute the relative risk (also known as the risk ratio).

    This function computes the relative risk associated with a 2x2
    contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
    of accepting a table as an argument, the individual numbers that are
    used to compute the relative risk are given as separate parameters.
    This is to avoid the ambiguity of which row or column of the contingency
    table corresponds to the "exposed" cases and which corresponds to the
    "control" cases. Unlike, say, the odds ratio, the relative risk is not
    invariant under an interchange of the rows or columns.

    Parameters
    ----------
    exposed_cases : nonnegative int
        The number of "cases" (i.e. occurrence of disease or other event
        of interest) among the sample of "exposed" individuals.
    exposed_total : positive int
        The total number of "exposed" individuals in the sample.
    control_cases : nonnegative int
        The number of "cases" among the sample of "control" or non-exposed
        individuals.
    control_total : positive int
        The total number of "control" individuals in the sample.

    Returns
    -------
    result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
        The object has the float attribute ``relative_risk``, which is::

            rr = (exposed_cases/exposed_total) / (control_cases/control_total)

        The object also has the method ``confidence_interval`` to compute
        the confidence interval of the relative risk for a given confidence
        level.

    Raises
    ------
    TypeError
        If any count argument is not an integer.
    ValueError
        If a count is negative, a total is not positive, or a cases count
        exceeds its total.

    See Also
    --------
    odds_ratio

    Notes
    -----
    The R package epitools has the function `riskratio`, which accepts
    a table with the following layout::

                        disease=0   disease=1
        exposed=0 (ref)    n00         n01
        exposed=1          n10         n11

    With a 2x2 table in the above format, the estimate of the CI is
    computed by `riskratio` when the argument method="wald" is given,
    or with the function `riskratio.wald`.

    To pass the same data to ``relative_risk``, use::

        relative_risk(n11, n10 + n11, n01, n00 + n01)

    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
           (second edition), Wiley, Hoboken, NJ, USA (2007).
    .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
           CRC Press LLC, Boca Raton, FL, USA (1996).

    Examples
    --------
    >>> from scipy.stats.contingency import relative_risk

    This example is from Example 3.1 of [2]_, a heart disease study
    where CAT refers to the level of circulating catecholamine and high
    CAT is the "exposed" category::

                High CAT   Low CAT    Total
                --------   -------    -----
        CHD         27       44         71
        No CHD      95      443        538

        Total      122      487        609

    >>> result = relative_risk(27, 122, 44, 487)
    >>> result.relative_risk
    2.4495156482861398

    Find the confidence interval for the relative risk.

    >>> result.confidence_interval(confidence_level=0.95)
    ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)

    The interval does not contain 1, so the data supports the statement
    that high CAT is associated with greater risk of CHD.
    """
    # Relative risk is a trivial calculation. The nontrivial part is in the
    # `confidence_interval` method of the RelativeRiskResult class.

    # Validate and normalize all four counts to plain ints.
    exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
    exposed_total = _validate_int(exposed_total, 1, "exposed_total")
    control_cases = _validate_int(control_cases, 0, "control_cases")
    control_total = _validate_int(control_total, 1, "control_total")

    if exposed_cases > exposed_total:
        raise ValueError('exposed_cases must not exceed exposed_total.')
    if control_cases > control_total:
        raise ValueError('control_cases must not exceed control_total.')

    # Edge cases follow the 0/0 -> nan, 0/x -> 0, x/0 -> inf conventions.
    if exposed_cases == 0:
        rr = np.nan if control_cases == 0 else 0.0
    elif control_cases == 0:
        rr = np.inf
    else:
        rr = ((exposed_cases / exposed_total)
              / (control_cases / control_total))

    return RelativeRiskResult(relative_risk=rr,
                              exposed_cases=exposed_cases,
                              exposed_total=exposed_total,
                              control_cases=control_cases,
                              control_total=control_total)
|
||||
2255
venv/lib/python3.12/site-packages/scipy/stats/_resampling.py
Normal file
2255
venv/lib/python3.12/site-packages/scipy/stats/_resampling.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,40 @@
|
||||
# This module exists only to allow Sphinx to generate docs
|
||||
# for the result objects returned by some functions in stats
|
||||
# _without_ adding them to the main stats documentation page.
|
||||
|
||||
"""
|
||||
Result classes
|
||||
--------------
|
||||
|
||||
.. currentmodule:: scipy.stats._result_classes
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
RelativeRiskResult
|
||||
BinomTestResult
|
||||
TukeyHSDResult
|
||||
DunnettResult
|
||||
PearsonRResult
|
||||
FitResult
|
||||
OddsRatioResult
|
||||
TtestResult
|
||||
ECDFResult
|
||||
EmpiricalDistributionFunction
|
||||
|
||||
"""
|
||||
|
||||
__all__ = ['BinomTestResult', 'RelativeRiskResult', 'TukeyHSDResult',
|
||||
'PearsonRResult', 'FitResult', 'OddsRatioResult',
|
||||
'TtestResult', 'DunnettResult', 'ECDFResult',
|
||||
'EmpiricalDistributionFunction']
|
||||
|
||||
|
||||
from ._binomtest import BinomTestResult
|
||||
from ._odds_ratio import OddsRatioResult
|
||||
from ._relative_risk import RelativeRiskResult
|
||||
from ._hypotests import TukeyHSDResult
|
||||
from ._multicomp import DunnettResult
|
||||
from ._stats_py import PearsonRResult, TtestResult
|
||||
from ._fit import FitResult
|
||||
from ._survival import ECDFResult, EmpiricalDistributionFunction
|
||||
@ -0,0 +1,56 @@
|
||||
import warnings
|
||||
from scipy.stats.sampling import RatioUniforms
|
||||
|
||||
def rvs_ratio_uniforms(pdf, umax, vmin, vmax, size=1, c=0, random_state=None):
|
||||
"""
|
||||
Generate random samples from a probability density function using the
|
||||
ratio-of-uniforms method.
|
||||
|
||||
.. deprecated:: 1.12.0
|
||||
`rvs_ratio_uniforms` is deprecated in favour of
|
||||
`scipy.stats.sampling.RatioUniforms` from version 1.12.0 and will
|
||||
be removed in SciPy 1.15.0
|
||||
|
||||
Parameters
|
||||
----------
|
||||
pdf : callable
|
||||
A function with signature `pdf(x)` that is proportional to the
|
||||
probability density function of the distribution.
|
||||
umax : float
|
||||
The upper bound of the bounding rectangle in the u-direction.
|
||||
vmin : float
|
||||
The lower bound of the bounding rectangle in the v-direction.
|
||||
vmax : float
|
||||
The upper bound of the bounding rectangle in the v-direction.
|
||||
size : int or tuple of ints, optional
|
||||
Defining number of random variates (default is 1).
|
||||
c : float, optional.
|
||||
Shift parameter of ratio-of-uniforms method, see Notes. Default is 0.
|
||||
random_state : {None, int, `numpy.random.Generator`,
|
||||
`numpy.random.RandomState`}, optional
|
||||
|
||||
If `seed` is None (or `np.random`), the `numpy.random.RandomState`
|
||||
singleton is used.
|
||||
If `seed` is an int, a new ``RandomState`` instance is used,
|
||||
seeded with `seed`.
|
||||
If `seed` is already a ``Generator`` or ``RandomState`` instance then
|
||||
that instance is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
rvs : ndarray
|
||||
The random variates distributed according to the probability
|
||||
distribution defined by the pdf.
|
||||
|
||||
Notes
|
||||
-----
|
||||
Please refer to `scipy.stats.sampling.RatioUniforms` for the documentation.
|
||||
"""
|
||||
warnings.warn("Please use `RatioUniforms` from the "
|
||||
"`scipy.stats.sampling` namespace. The "
|
||||
"`scipy.stats.rvs_ratio_uniforms` namespace is deprecated "
|
||||
"and will be removed in SciPy 1.15.0",
|
||||
category=DeprecationWarning, stacklevel=2)
|
||||
gen = RatioUniforms(pdf, umax=umax, vmin=vmin, vmax=vmax,
|
||||
c=c, random_state=random_state)
|
||||
return gen.rvs(size)
|
||||
1314
venv/lib/python3.12/site-packages/scipy/stats/_sampling.py
Normal file
1314
venv/lib/python3.12/site-packages/scipy/stats/_sampling.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,712 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
from dataclasses import dataclass
|
||||
from typing import (
|
||||
Callable, Literal, Protocol, TYPE_CHECKING
|
||||
)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy.stats._common import ConfidenceInterval
|
||||
from scipy.stats._qmc import check_random_state
|
||||
from scipy.stats._resampling import BootstrapResult
|
||||
from scipy.stats import qmc, bootstrap
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy.typing as npt
|
||||
from scipy._lib._util import DecimalNumber, IntNumber, SeedType
|
||||
|
||||
|
||||
__all__ = [
|
||||
'sobol_indices'
|
||||
]
|
||||
|
||||
|
||||
def f_ishigami(x: npt.ArrayLike) -> np.ndarray:
    r"""Evaluate the Ishigami benchmark function.

    .. math::

        Y(\mathbf{x}) = \sin x_1 + 7 \sin^2 x_2 + 0.1 x_3^4 \sin x_1

    with :math:`\mathbf{x} \in [-\pi, \pi]^3`.

    Parameters
    ----------
    x : array_like ([x1, x2, x3], n)
        Input samples, one column per sample.

    Returns
    -------
    f : array_like (n,)
        Function evaluation.

    References
    ----------
    .. [1] Ishigami, T. and T. Homma. "An importance quantification technique
       in uncertainty analysis for computer models." IEEE,
       :doi:`10.1109/ISUMA.1990.151285`, 1990.
    """
    samples = np.atleast_2d(x)
    # Name the three coordinates so the formula reads like the math above.
    x1, x2, x3 = samples[0], samples[1], samples[2]
    return np.sin(x1) + 7 * np.sin(x2)**2 + 0.1 * (x3**4) * np.sin(x1)
|
||||
|
||||
|
||||
def sample_A_B(
    n: IntNumber,
    dists: list[PPFDist],
    random_state: SeedType = None
) -> np.ndarray:
    """Draw the two sample matrices A and B, stacked as shape (2, d, n).

    A single Sobol' sequence with ``2*d`` columns provides the two
    uncorrelated matrices at once, which is more efficient than two
    independent Sobol' draws. See sec. 5 from [1]_.

    Each uniform column is mapped through the matching marginal's ``ppf``.

    References
    ----------
    .. [1] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
       S. Tarantola. "Variance based sensitivity analysis of model
       output. Design and estimator for the total sensitivity index."
       Computer Physics Communications, 181(2):259-270,
       :doi:`10.1016/j.cpc.2009.09.018`, 2010.
    """
    d = len(dists)
    qrng = qmc.Sobol(d=2*d, seed=random_state, bits=64)
    A_B = qrng.random(n).T.reshape(2, d, -1)
    try:
        # Transform uniforms to each parameter's marginal distribution.
        for idx, dist in enumerate(dists):
            A_B[:, idx] = dist.ppf(A_B[:, idx])
    except AttributeError as exc:
        message = "Each distribution in `dists` must have method `ppf`."
        raise ValueError(message) from exc
    return A_B
|
||||
|
||||
|
||||
def sample_AB(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Build the AB matrix: rows of B substituted into A. Shape (d, d, n).

    Page ``k`` is a copy of ``A`` whose ``k``-th row has been replaced by
    the ``k``-th row of ``B``; the d pages are returned stacked.
    """
    d, n = A.shape
    pages = np.tile(A, (d, 1, 1))
    # Swap exactly one row per page.
    for k in range(d):
        pages[k, k] = B[k]
    return pages
|
||||
|
||||
|
||||
def saltelli_2010(
    f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
    r"""Compute Sobol' indices with the Saltelli 2010 estimators.

    .. math::

        S_i = \frac{1}{N} \sum_{j=1}^N
        f(\mathbf{B})_j (f(\mathbf{AB}^{(i)})_j - f(\mathbf{A})_j)

    .. math::

        S_{T_i} = \frac{1}{N} \sum_{j=1}^N
        (f(\mathbf{A})_j - f(\mathbf{AB}^{(i)})_j)^2

    Parameters
    ----------
    f_A, f_B : array_like (s, n)
        Function values at A and B, respectively
    f_AB : array_like (d, s, n)
        Function values at each of the AB pages

    Returns
    -------
    s, st : array_like (s, d)
        First order and total order Sobol' indices.

    References
    ----------
    .. [1] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
       S. Tarantola. "Variance based sensitivity analysis of model
       output. Design and estimator for the total sensitivity index."
       Computer Physics Communications, 181(2):259-270,
       :doi:`10.1016/j.cpc.2009.09.018`, 2010.
    """
    # The empirical variance uses only A and B, which are independent;
    # AB is correlated with both and must not enter the variance.
    var = np.var([f_A, f_B], axis=(0, -1))

    # (f_A - f_AB)**2 == delta**2 exactly in floating point, so reusing
    # `delta` for both estimators preserves bitwise results.
    delta = f_AB - f_A

    # Dividing by the variance yields variance ratios (eq. 2).
    first = np.mean(f_B * delta, axis=-1) / var      # Table 2 (b)
    total = 0.5 * np.mean(delta ** 2, axis=-1) / var  # Table 2 (f)

    return first.T, total.T
|
||||
|
||||
|
||||
@dataclass
class BootstrapSobolResult:
    """Bootstrap results for Sobol' indices.

    Returned by ``SobolResult.bootstrap``; bundles one `BootstrapResult`
    (confidence interval, bootstrap distribution, standard error) per
    family of indices.
    """
    # Bootstrap result of the first order indices.
    first_order: BootstrapResult
    # Bootstrap result of the total order indices.
    total_order: BootstrapResult
|
||||
|
||||
|
||||
@dataclass
class SobolResult:
    """Result object returned by `sobol_indices`.

    The public attributes are the computed indices; underscore attributes
    retain the samples and function evaluations needed by `bootstrap`.
    """
    # First and total order Sobol' indices; squeezed to 1D when there is a
    # single output (see the `indices_method` wrapper in `sobol_indices`).
    first_order: np.ndarray
    total_order: np.ndarray
    # Callable computing (first, total) indices from f_A, f_B, f_AB.
    _indices_method: Callable
    # Function evaluations the indices were computed from.
    _f_A: np.ndarray
    _f_B: np.ndarray
    _f_AB: np.ndarray
    # Sample matrices; only populated when `func` was a callable.
    _A: np.ndarray | None = None
    _B: np.ndarray | None = None
    _AB: np.ndarray | None = None
    # Cached bootstrap state so repeated `bootstrap` calls reuse and extend
    # previous resamples (passed back via `bootstrap_result=` below).
    _bootstrap_result: BootstrapResult | None = None

    def bootstrap(
        self,
        confidence_level: DecimalNumber = 0.95,
        n_resamples: IntNumber = 999
    ) -> BootstrapSobolResult:
        """Bootstrap Sobol' indices to provide confidence intervals.

        Parameters
        ----------
        confidence_level : float, default: ``0.95``
            The confidence level of the confidence intervals.
        n_resamples : int, default: ``999``
            The number of resamples performed to form the bootstrap
            distribution of the indices.

        Returns
        -------
        res : BootstrapSobolResult
            Bootstrap result containing the confidence intervals and the
            bootstrap distribution of the indices.

            An object with attributes:

            first_order : BootstrapResult
                Bootstrap result of the first order indices.
            total_order : BootstrapResult
                Bootstrap result of the total order indices.
            See `BootstrapResult` for more details.

        """
        def statistic(idx):
            # Resample the stored function evaluations by sample index
            # (columns), then recompute the indices on the resample.
            f_A_ = self._f_A[:, idx]
            f_B_ = self._f_B[:, idx]
            f_AB_ = self._f_AB[..., idx]
            return self._indices_method(f_A_, f_B_, f_AB_)

        n = self._f_A.shape[1]

        res = bootstrap(
            [np.arange(n)], statistic=statistic, method="BCa",
            n_resamples=n_resamples,
            confidence_level=confidence_level,
            bootstrap_result=self._bootstrap_result
        )
        self._bootstrap_result = res

        # `statistic` returns a pair (first, total); split the combined
        # bootstrap result into one BootstrapResult per index family.
        first_order = BootstrapResult(
            confidence_interval=ConfidenceInterval(
                res.confidence_interval.low[0], res.confidence_interval.high[0]
            ),
            bootstrap_distribution=res.bootstrap_distribution[0],
            standard_error=res.standard_error[0],
        )
        total_order = BootstrapResult(
            confidence_interval=ConfidenceInterval(
                res.confidence_interval.low[1], res.confidence_interval.high[1]
            ),
            bootstrap_distribution=res.bootstrap_distribution[1],
            standard_error=res.standard_error[1],
        )

        return BootstrapSobolResult(
            first_order=first_order, total_order=total_order
        )
|
||||
|
||||
|
||||
class PPFDist(Protocol):
    """Structural type for the distributions accepted by `sobol_indices`.

    Any object exposing a ``ppf`` callable (percent point function, i.e.
    inverse CDF) satisfies this protocol — e.g. frozen ``scipy.stats``
    distributions such as ``uniform(loc=-np.pi, scale=2*np.pi)``.
    """
    @property
    def ppf(self) -> Callable[..., float]:
        ...
|
||||
|
||||
|
||||
def sobol_indices(
    *,
    func: Callable[[np.ndarray], npt.ArrayLike] |
          dict[Literal['f_A', 'f_B', 'f_AB'], np.ndarray],
    n: IntNumber,
    dists: list[PPFDist] | None = None,
    method: Callable | Literal['saltelli_2010'] = 'saltelli_2010',
    random_state: SeedType = None
) -> SobolResult:
    r"""Global sensitivity indices of Sobol'.

    Parameters
    ----------
    func : callable or dict(str, array_like)
        If `func` is a callable, function to compute the Sobol' indices from.
        Its signature must be::

            func(x: ArrayLike) -> ArrayLike

        with ``x`` of shape ``(d, n)`` and output of shape ``(s, n)`` where:

        - ``d`` is the input dimensionality of `func`
          (number of input variables),
        - ``s`` is the output dimensionality of `func`
          (number of output variables), and
        - ``n`` is the number of samples (see `n` below).

        Function evaluation values must be finite.

        If `func` is a dictionary, contains the function evaluations from three
        different arrays. Keys must be: ``f_A``, ``f_B`` and ``f_AB``.
        ``f_A`` and ``f_B`` should have a shape ``(s, n)`` and ``f_AB``
        should have a shape ``(d, s, n)``.
        This is an advanced feature and misuse can lead to wrong analysis.
    n : int
        Number of samples used to generate the matrices ``A`` and ``B``.
        Must be a power of 2. The total number of points at which `func` is
        evaluated will be ``n*(d+2)``.
    dists : list(distributions), optional
        List of each parameter's distribution. The distribution of parameters
        depends on the application and should be carefully chosen.
        Parameters are assumed to be independently distributed, meaning there
        is no constraint nor relationship between their values.

        Distributions must be an instance of a class with a ``ppf``
        method.

        Must be specified if `func` is a callable, and ignored otherwise.
    method : Callable or str, default: 'saltelli_2010'
        Method used to compute the first and total Sobol' indices.

        If a callable, its signature must be::

            func(f_A: np.ndarray, f_B: np.ndarray, f_AB: np.ndarray)
            -> Tuple[np.ndarray, np.ndarray]

        with ``f_A, f_B`` of shape ``(s, n)`` and ``f_AB`` of shape
        ``(d, s, n)``.
        These arrays contain the function evaluations from three different sets
        of samples.
        The output is a tuple of the first and total indices with
        shape ``(s, d)``.
        This is an advanced feature and misuse can lead to wrong analysis.
    random_state : {None, int, `numpy.random.Generator`}, optional
        If `random_state` is an int or None, a new `numpy.random.Generator` is
        created using ``np.random.default_rng(random_state)``.
        If `random_state` is already a ``Generator`` instance, then the
        provided instance is used.

    Returns
    -------
    res : SobolResult
        An object with attributes:

        first_order : ndarray of shape (s, d)
            First order Sobol' indices.
        total_order : ndarray of shape (s, d)
            Total order Sobol' indices.

        And method:

        bootstrap(confidence_level: float, n_resamples: int)
        -> BootstrapSobolResult

            A method providing confidence intervals on the indices.
            See `scipy.stats.bootstrap` for more details.

            The bootstrapping is done on both first and total order indices,
            and they are available in `BootstrapSobolResult` as attributes
            ``first_order`` and ``total_order``.

    Notes
    -----
    The Sobol' method [1]_, [2]_ is a variance-based Sensitivity Analysis which
    obtains the contribution of each parameter to the variance of the
    quantities of interest (QoIs; i.e., the outputs of `func`).
    Respective contributions can be used to rank the parameters and
    also gauge the complexity of the model by computing the
    model's effective (or mean) dimension.

    .. note::

        Parameters are assumed to be independently distributed. Each
        parameter can still follow any distribution. In fact, the distribution
        is very important and should match the real distribution of the
        parameters.

    It uses a functional decomposition of the variance of the function to
    explore

    .. math::

        \mathbb{V}(Y) = \sum_{i}^{d} \mathbb{V}_i (Y) + \sum_{i<j}^{d}
        \mathbb{V}_{ij}(Y) + ... + \mathbb{V}_{1,2,...,d}(Y),

    introducing conditional variances:

    .. math::

        \mathbb{V}_i(Y) = \mathbb{\mathbb{V}}[\mathbb{E}(Y|x_i)]
        \qquad
        \mathbb{V}_{ij}(Y) = \mathbb{\mathbb{V}}[\mathbb{E}(Y|x_i x_j)]
        - \mathbb{V}_i(Y) - \mathbb{V}_j(Y),

    Sobol' indices are expressed as

    .. math::

        S_i = \frac{\mathbb{V}_i(Y)}{\mathbb{V}[Y]}
        \qquad
        S_{ij} =\frac{\mathbb{V}_{ij}(Y)}{\mathbb{V}[Y]}.

    :math:`S_{i}` corresponds to the first-order term which apprises the
    contribution of the i-th parameter, while :math:`S_{ij}` corresponds to the
    second-order term which informs about the contribution of interactions
    between the i-th and the j-th parameters. These equations can be
    generalized to compute higher order terms; however, they are expensive to
    compute and their interpretation is complex.
    This is why only first order indices are provided.

    Total order indices represent the global contribution of the parameters
    to the variance of the QoI and are defined as:

    .. math::

        S_{T_i} = S_i + \sum_j S_{ij} + \sum_{j,k} S_{ijk} + ...
        = 1 - \frac{\mathbb{V}[\mathbb{E}(Y|x_{\sim i})]}{\mathbb{V}[Y]}.

    First order indices sum to at most 1, while total order indices sum to at
    least 1. If there are no interactions, then first and total order indices
    are equal, and both first and total order indices sum to 1.

    .. warning::

        Negative Sobol' values are due to numerical errors. Increasing the
        number of points `n` should help.

        The number of samples required to have a good analysis increases with
        the dimensionality of the problem. e.g. for a 3 dimension problem,
        consider at minimum ``n >= 2**12``. The more complex the model is,
        the more samples will be needed.

        Even for a purely additive model, the indices may not sum to 1 due
        to numerical noise.

    References
    ----------
    .. [1] Sobol, I. M.. "Sensitivity analysis for nonlinear mathematical
       models." Mathematical Modeling and Computational Experiment, 1:407-414,
       1993.
    .. [2] Sobol, I. M. (2001). "Global sensitivity indices for nonlinear
       mathematical models and their Monte Carlo estimates." Mathematics
       and Computers in Simulation, 55(1-3):271-280,
       :doi:`10.1016/S0378-4754(00)00270-6`, 2001.
    .. [3] Saltelli, A. "Making best use of model evaluations to
       compute sensitivity indices." Computer Physics Communications,
       145(2):280-297, :doi:`10.1016/S0010-4655(02)00280-1`, 2002.
    .. [4] Saltelli, A., M. Ratto, T. Andres, F. Campolongo, J. Cariboni,
       D. Gatelli, M. Saisana, and S. Tarantola. "Global Sensitivity Analysis.
       The Primer." 2007.
    .. [5] Saltelli, A., P. Annoni, I. Azzini, F. Campolongo, M. Ratto, and
       S. Tarantola. "Variance based sensitivity analysis of model
       output. Design and estimator for the total sensitivity index."
       Computer Physics Communications, 181(2):259-270,
       :doi:`10.1016/j.cpc.2009.09.018`, 2010.
    .. [6] Ishigami, T. and T. Homma. "An importance quantification technique
       in uncertainty analysis for computer models." IEEE,
       :doi:`10.1109/ISUMA.1990.151285`, 1990.

    Examples
    --------
    The following is an example with the Ishigami function [6]_

    .. math::

        Y(\mathbf{x}) = \sin x_1 + 7 \sin^2 x_2 + 0.1 x_3^4 \sin x_1,

    with :math:`\mathbf{x} \in [-\pi, \pi]^3`. This function exhibits strong
    non-linearity and non-monotonicity.

    Remember, Sobol' indices assume that samples are independently
    distributed. In this case we use a uniform distribution on each marginals.

    >>> import numpy as np
    >>> from scipy.stats import sobol_indices, uniform
    >>> rng = np.random.default_rng()
    >>> def f_ishigami(x):
    ...     f_eval = (
    ...         np.sin(x[0])
    ...         + 7 * np.sin(x[1])**2
    ...         + 0.1 * (x[2]**4) * np.sin(x[0])
    ...     )
    ...     return f_eval
    >>> indices = sobol_indices(
    ...     func=f_ishigami, n=1024,
    ...     dists=[
    ...         uniform(loc=-np.pi, scale=2*np.pi),
    ...         uniform(loc=-np.pi, scale=2*np.pi),
    ...         uniform(loc=-np.pi, scale=2*np.pi)
    ...     ],
    ...     random_state=rng
    ... )
    >>> indices.first_order
    array([0.31637954, 0.43781162, 0.00318825])
    >>> indices.total_order
    array([0.56122127, 0.44287857, 0.24229595])

    Confidence interval can be obtained using bootstrapping.

    >>> boot = indices.bootstrap()

    Then, this information can be easily visualized.

    >>> import matplotlib.pyplot as plt
    >>> fig, axs = plt.subplots(1, 2, figsize=(9, 4))
    >>> _ = axs[0].errorbar(
    ...     [1, 2, 3], indices.first_order, fmt='o',
    ...     yerr=[
    ...         indices.first_order - boot.first_order.confidence_interval.low,
    ...         boot.first_order.confidence_interval.high - indices.first_order
    ...     ],
    ... )
    >>> axs[0].set_ylabel("First order Sobol' indices")
    >>> axs[0].set_xlabel('Input parameters')
    >>> axs[0].set_xticks([1, 2, 3])
    >>> _ = axs[1].errorbar(
    ...     [1, 2, 3], indices.total_order, fmt='o',
    ...     yerr=[
    ...         indices.total_order - boot.total_order.confidence_interval.low,
    ...         boot.total_order.confidence_interval.high - indices.total_order
    ...     ],
    ... )
    >>> axs[1].set_ylabel("Total order Sobol' indices")
    >>> axs[1].set_xlabel('Input parameters')
    >>> axs[1].set_xticks([1, 2, 3])
    >>> plt.tight_layout()
    >>> plt.show()

    .. note::

        By default, `scipy.stats.uniform` has support ``[0, 1]``.
        Using the parameters ``loc`` and ``scale``, one obtains the uniform
        distribution on ``[loc, loc + scale]``.

    This result is particularly interesting because the first order index
    :math:`S_{x_3} = 0` whereas its total order is :math:`S_{T_{x_3}} = 0.244`.
    This means that higher order interactions with :math:`x_3` are responsible
    for the difference. Almost 25% of the observed variance
    on the QoI is due to the correlations between :math:`x_3` and :math:`x_1`,
    although :math:`x_3` by itself has no impact on the QoI.

    The following gives a visual explanation of Sobol' indices on this
    function. Let's generate 1024 samples in :math:`[-\pi, \pi]^3` and
    calculate the value of the output.

    >>> from scipy.stats import qmc
    >>> n_dim = 3
    >>> p_labels = ['$x_1$', '$x_2$', '$x_3$']
    >>> sample = qmc.Sobol(d=n_dim, seed=rng).random(1024)
    >>> sample = qmc.scale(
    ...     sample=sample,
    ...     l_bounds=[-np.pi, -np.pi, -np.pi],
    ...     u_bounds=[np.pi, np.pi, np.pi]
    ... )
    >>> output = f_ishigami(sample.T)

    Now we can do scatter plots of the output with respect to each parameter.
    This gives a visual way to understand how each parameter impacts the
    output of the function.

    >>> fig, ax = plt.subplots(1, n_dim, figsize=(12, 4))
    >>> for i in range(n_dim):
    ...     xi = sample[:, i]
    ...     ax[i].scatter(xi, output, marker='+')
    ...     ax[i].set_xlabel(p_labels[i])
    >>> ax[0].set_ylabel('Y')
    >>> plt.tight_layout()
    >>> plt.show()

    Now Sobol' goes a step further:
    by conditioning the output value by given values of the parameter
    (black lines), the conditional output mean is computed. It corresponds to
    the term :math:`\mathbb{E}(Y|x_i)`. Taking the variance of this term gives
    the numerator of the Sobol' indices.

    >>> mini = np.min(output)
    >>> maxi = np.max(output)
    >>> n_bins = 10
    >>> bins = np.linspace(-np.pi, np.pi, num=n_bins, endpoint=False)
    >>> dx = bins[1] - bins[0]
    >>> fig, ax = plt.subplots(1, n_dim, figsize=(12, 4))
    >>> for i in range(n_dim):
    ...     xi = sample[:, i]
    ...     ax[i].scatter(xi, output, marker='+')
    ...     ax[i].set_xlabel(p_labels[i])
    ...     for bin_ in bins:
    ...         idx = np.where((bin_ <= xi) & (xi <= bin_ + dx))
    ...         xi_ = xi[idx]
    ...         y_ = output[idx]
    ...         ave_y_ = np.mean(y_)
    ...         ax[i].plot([bin_ + dx/2] * 2, [mini, maxi], c='k')
    ...         ax[i].scatter(bin_ + dx/2, ave_y_, c='r')
    >>> ax[0].set_ylabel('Y')
    >>> plt.tight_layout()
    >>> plt.show()

    Looking at :math:`x_3`, the variance
    of the mean is zero leading to :math:`S_{x_3} = 0`. But we can further
    observe that the variance of the output is not constant along the parameter
    values of :math:`x_3`. This heteroscedasticity is explained by higher order
    interactions. Moreover, a heteroscedasticity is also noticeable on
    :math:`x_1` leading to an interaction between :math:`x_3` and :math:`x_1`.
    On :math:`x_2`, the variance seems to be constant and thus null interaction
    with this parameter can be supposed.

    This case is fairly simple to analyse visually---although it is only a
    qualitative analysis. Nevertheless, when the number of input parameters
    increases such analysis becomes unrealistic as it would be difficult to
    conclude on high-order terms. Hence the benefit of using Sobol' indices.

    """
    random_state = check_random_state(random_state)

    # Sobol' sequences only have their balance properties at power-of-2
    # sample sizes; `n != n_` additionally rejects non-integer inputs.
    n_ = int(n)
    if not (n_ & (n_ - 1) == 0) or n != n_:
        raise ValueError(
            "The balance properties of Sobol' points require 'n' "
            "to be a power of 2."
        )
    n = n_

    if not callable(method):
        indices_methods: dict[str, Callable] = {
            "saltelli_2010": saltelli_2010,
        }
        try:
            method = method.lower()  # type: ignore[assignment]
            indices_method_ = indices_methods[method]
        except KeyError as exc:
            message = (
                f"{method!r} is not a valid 'method'. It must be one of"
                f" {set(indices_methods)!r} or a callable."
            )
            raise ValueError(message) from exc
    else:
        indices_method_ = method
        sig = inspect.signature(indices_method_)

        # A custom estimator must accept exactly the three keyword
        # arguments used in the call below.
        if set(sig.parameters) != {'f_A', 'f_B', 'f_AB'}:
            message = (
                "If 'method' is a callable, it must have the following"
                f" signature: {inspect.signature(saltelli_2010)}"
            )
            raise ValueError(message)

    def indices_method(f_A, f_B, f_AB):
        """Wrap indices method to ensure proper output dimension.

        1D when single output, 2D otherwise.
        """
        return np.squeeze(indices_method_(f_A=f_A, f_B=f_B, f_AB=f_AB))

    if callable(func):
        if dists is None:
            raise ValueError(
                "'dists' must be defined when 'func' is a callable."
            )

        def wrapped_func(x):
            return np.atleast_2d(func(x))

        A, B = sample_A_B(n=n, dists=dists, random_state=random_state)
        AB = sample_AB(A=A, B=B)

        f_A = wrapped_func(A)

        if f_A.shape[1] != n:
            raise ValueError(
                "'func' output should have a shape ``(s, -1)`` with ``s`` "
                "the number of output."
            )

        def funcAB(AB):
            # Evaluate `func` on all d pages of AB in a single call by
            # flattening the pages into extra columns, then restore the
            # (d, s, n) layout on the output.
            d, d, n = AB.shape
            AB = np.moveaxis(AB, 0, -1).reshape(d, n*d)
            f_AB = wrapped_func(AB)
            return np.moveaxis(f_AB.reshape((-1, n, d)), -1, 0)

        f_B = wrapped_func(B)
        f_AB = funcAB(AB)
    else:
        message = (
            "When 'func' is a dictionary, it must contain the following "
            "keys: 'f_A', 'f_B' and 'f_AB'."
            "'f_A' and 'f_B' should have a shape ``(s, n)`` and 'f_AB' "
            "should have a shape ``(d, s, n)``."
        )
        try:
            f_A, f_B, f_AB = np.atleast_2d(
                func['f_A'], func['f_B'], func['f_AB']
            )
        except KeyError as exc:
            raise ValueError(message) from exc

        if f_A.shape[1] != n or f_A.shape != f_B.shape or \
                f_AB.shape == f_A.shape or f_AB.shape[-1] % n != 0:
            raise ValueError(message)

    # Normalization by mean
    # Sobol', I. and Levitan, Y. L. (1999). On the use of variance reducing
    # multipliers in monte carlo computations of a global sensitivity index.
    # Computer Physics Communications, 117(1) :52-61.
    mean = np.mean([f_A, f_B], axis=(0, -1)).reshape(-1, 1)
    f_A -= mean
    f_B -= mean
    f_AB -= mean

    # Compute indices
    # Filter warnings for constant output as var = 0
    with np.errstate(divide='ignore', invalid='ignore'):
        first_order, total_order = indices_method(f_A=f_A, f_B=f_B, f_AB=f_AB)

    # null variance means null indices
    first_order[~np.isfinite(first_order)] = 0
    total_order[~np.isfinite(total_order)] = 0

    res = dict(
        first_order=first_order,
        total_order=total_order,
        _indices_method=indices_method,
        _f_A=f_A,
        _f_B=f_B,
        _f_AB=f_AB
    )

    if callable(func):
        res.update(
            dict(
                _A=A,
                _B=B,
                _AB=AB,
            )
        )

    return SobolResult(**res)
|
||||
Binary file not shown.
54
venv/lib/python3.12/site-packages/scipy/stats/_sobol.pyi
Normal file
54
venv/lib/python3.12/site-packages/scipy/stats/_sobol.pyi
Normal file
@ -0,0 +1,54 @@
|
||||
# Type stubs for the compiled Cython module ``scipy.stats._sobol``.
# NOTE(review): the semantics of these functions are not visible here —
# the implementations live in the compiled extension; confirm any
# behavioral assumption against ``_sobol.pyx``.

import numpy as np
from scipy._lib._util import IntNumber
from typing import Literal

def _initialize_v(
    v : np.ndarray,
    dim : IntNumber,
    bits: IntNumber
) -> None: ...

def _cscramble (
    dim : IntNumber,
    bits: IntNumber,
    ltm : np.ndarray,
    sv: np.ndarray
) -> None: ...

def _fill_p_cumulative(
    p: np.ndarray,
    p_cumulative: np.ndarray
) -> None: ...

def _draw(
    n : IntNumber,
    num_gen: IntNumber,
    dim: IntNumber,
    scale: float,
    sv: np.ndarray,
    quasi: np.ndarray,
    sample: np.ndarray
) -> None: ...

def _fast_forward(
    n: IntNumber,
    num_gen: IntNumber,
    dim: IntNumber,
    sv: np.ndarray,
    quasi: np.ndarray
) -> None: ...

def _categorize(
    draws: np.ndarray,
    p_cumulative: np.ndarray,
    result: np.ndarray
) -> None: ...

# Module-level limits exported by the extension.
_MAXDIM: Literal[21201]
_MAXDEG: Literal[18]

def _test_find_index(
    p_cumulative: np.ndarray,
    size: int,
    value: float
) -> int: ...
|
||||
Binary file not shown.
Binary file not shown.
10
venv/lib/python3.12/site-packages/scipy/stats/_stats.pxd
Normal file
10
venv/lib/python3.12/site-packages/scipy/stats/_stats.pxd
Normal file
@ -0,0 +1,10 @@
|
||||
# destined to be used in a LowLevelCallable
# NOTE(review): each signature follows the (x, ..., void *user_data)
# convention, with parameters packed behind ``user_data``; confirm the
# packing layout against the call sites in ``_stats.pyx``.

cdef double _geninvgauss_pdf(double x, void *user_data) noexcept nogil
cdef double _studentized_range_cdf(int n, double[2] x, void *user_data) noexcept nogil
cdef double _studentized_range_cdf_asymptotic(double z, void *user_data) noexcept nogil
cdef double _studentized_range_pdf(int n, double[2] x, void *user_data) noexcept nogil
cdef double _studentized_range_pdf_asymptotic(double z, void *user_data) noexcept nogil
cdef double _studentized_range_moment(int n, double[3] x_arg, void *user_data) noexcept nogil
cdef double _genhyperbolic_pdf(double x, void *user_data) noexcept nogil
cdef double _genhyperbolic_logpdf(double x, void *user_data) noexcept nogil
|
||||
@ -0,0 +1,303 @@
|
||||
import warnings
|
||||
import numpy as np
|
||||
from . import distributions
|
||||
from .._lib._bunch import _make_tuple_bunch
|
||||
from ._stats_pythran import siegelslopes as siegelslopes_pythran
|
||||
|
||||
__all__ = ['_find_repeats', 'theilslopes', 'siegelslopes']
|
||||
|
||||
# This is not a namedtuple for backwards compatibility. See PR #12983
# (a tuple bunch unpacks like a plain tuple but also exposes named fields).
# Result type of `theilslopes`: (slope, intercept, low_slope, high_slope).
TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
                                      ['slope', 'intercept',
                                       'low_slope', 'high_slope'])
# Result type of `siegelslopes`: (slope, intercept).
SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
                                       ['slope', 'intercept'])
|
||||
|
||||
|
||||
def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression. It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing estimate for intercept.
        Following methods are supported,

            * 'joint': Uses np.median(y - slope * x) as intercept.
            * 'separate': Uses np.median(y) - slope * np.median(x)
              as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    result : ``TheilslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Theil slope.
        intercept : float
            Intercept of the Theil line.
        low_slope : float
            Lower bound of the confidence interval on `slope`.
        high_slope : float
            Upper bound of the confidence interval on `slope`.

    See Also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    The implementation of `theilslopes` follows [1]_. The intercept is
    not defined in [1]_, and here it is defined as ``median(y) -
    slope*median(x)``, which is given in [3]_. Other definitions of
    the intercept exist in the literature such as ``median(y - slope*x)``
    in [4]_. The approach to compute the intercept can be determined by the
    parameter ``method``. A confidence interval for the intercept is not
    given as this question is not addressed in [1]_.

    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
    ``low_slope``, and ``high_slope``, so one can continue to write::

        slope, intercept, low_slope, high_slope = theilslopes(y, x)

    References
    ----------
    .. [1] P.K. Sen, "Estimates of the regression coefficient based on
           Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
    .. [2] H. Theil, "A rank-invariant method of linear and polynomial
           regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
           53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
    .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
           John Wiley and Sons, New York, pp. 493.
    .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope, intercept and 90% confidence interval.  For comparison,
    also compute the least-squares fit with `linregress`:

    >>> res = stats.theilslopes(y, x, 0.90, method='separate')
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Theil-Sen regression line is shown in red, with the
    dashed red lines illustrating the confidence interval of the slope (note
    that the dashed red lines are not the confidence interval of the regression
    as the confidence interval of the intercept is not included). The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()

    """
    if method not in ['joint', 'separate']:
        # BUGFIX: the two adjacent literals used to concatenate without a
        # separating space, yielding "...'separate'.'joint' is invalid.".
        raise ValueError("method must be either 'joint' or 'separate'. "
                         f"'{method}' is invalid.")
    # We copy both x and y so we can use _find_repeats.
    y = np.array(y, dtype=float, copy=True).ravel()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.array(x, dtype=float, copy=True).ravel()
        if len(x) != len(y):
            raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")

    # Compute sorted slopes only when deltax > 0
    deltax = x[:, np.newaxis] - x
    deltay = y[:, np.newaxis] - y
    slopes = deltay[deltax > 0] / deltax[deltax > 0]
    if not slopes.size:
        # All x identical: warn but fall through; np.median of an empty
        # array yields nan for the slope below.
        msg = "All `x` coordinates are identical."
        warnings.warn(msg, RuntimeWarning, stacklevel=2)
    slopes.sort()
    medslope = np.median(slopes)
    if method == 'joint':
        medinter = np.median(y - medslope * x)
    else:
        medinter = np.median(y) - medslope * np.median(x)
    # Now compute confidence intervals
    if alpha > 0.5:
        # `alpha` is symmetric around 0.5; normalize to the lower tail.
        alpha = 1. - alpha

    z = distributions.norm.ppf(alpha / 2.)
    # This implements (2.6) from Sen (1968)
    _, nxreps = _find_repeats(x)
    _, nyreps = _find_repeats(y)
    nt = len(slopes)  # N in Sen (1968)
    ny = len(y)       # n in Sen (1968)
    # Equation 2.6 in Sen (1968): variance of Kendall's statistic,
    # corrected for ties in x and in y.
    sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
                     sum(k * (k-1) * (2*k + 5) for k in nxreps) -
                     sum(k * (k-1) * (2*k + 5) for k in nyreps))
    # Find the confidence interval indices in `slopes`
    try:
        sigma = np.sqrt(sigsq)
        Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
        Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
        delta = slopes[[Rl, Ru]]
    except (ValueError, IndexError):
        # Degenerate inputs (e.g. empty `slopes`): no interval available.
        delta = (np.nan, np.nan)

    return TheilslopesResult(slope=medslope, intercept=medinter,
                             low_slope=delta[0], high_slope=delta[1])
|
||||
|
||||
|
||||
def _find_repeats(arr):
|
||||
# This function assumes it may clobber its input.
|
||||
if len(arr) == 0:
|
||||
return np.array(0, np.float64), np.array(0, np.intp)
|
||||
|
||||
# XXX This cast was previously needed for the Fortran implementation,
|
||||
# should we ditch it?
|
||||
arr = np.asarray(arr, np.float64).ravel()
|
||||
arr.sort()
|
||||
|
||||
# Taken from NumPy 1.9's np.unique.
|
||||
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
||||
unique = arr[change]
|
||||
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
||||
freq = np.diff(change_idx)
|
||||
atleast2 = freq > 1
|
||||
return unique[atleast2], freq[atleast2]
|
||||
|
||||
|
||||
def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians (see [1]_) to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``slope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    result : ``SiegelslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Estimate of the slope of the regression line.
        intercept : float
            Estimate of the intercept of the regression line.

    See Also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    With ``n = len(y)``, compute ``m_j`` as the median of
    the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
    ``slope`` is then the median of all slopes ``m_j``.
    Two ways are given to estimate the intercept in [1]_ which can be chosen
    via the parameter ``method``.
    The hierarchical approach uses the estimated slope ``slope``
    and computes ``intercept`` as the median of ``y - slope*x``.
    The other approach estimates the intercept separately as follows: for
    each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
    lines through the remaining points and take the median ``i_j``.
    ``intercept`` is the median of the ``i_j``.

    The implementation computes `n` times the median of a vector of size `n`
    which can be slow for large vectors. There are more efficient algorithms
    (see [2]_) which are not implemented here.

    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 2, with fields ``slope`` and
    ``intercept``, so one can continue to write::

        slope, intercept = siegelslopes(y, x)

    References
    ----------
    .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
           Biometrika, Vol. 69, pp. 242-244, 1982.

    .. [2] A. Stein and M. Werman, "Finding the repeated median regression
           line", Proceedings of the Third Annual ACM-SIAM Symposium on
           Discrete Algorithms, pp. 409-413, 1992.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope and intercept.  For comparison, also compute the
    least-squares fit with `linregress`:

    >>> res = stats.siegelslopes(y, x)
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Siegel regression line is shown in red. The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()

    """
    if method not in ('hierarchical', 'separate'):
        raise ValueError("method can only be 'hierarchical' or 'separate'")

    y = np.asarray(y).ravel()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.asarray(x, dtype=float).ravel()
        if len(x) != len(y):
            raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")

    # Work in at least float32 precision for the compiled kernel.
    common_dtype = np.result_type(x, y, np.float32)
    x = x.astype(common_dtype)
    y = y.astype(common_dtype)

    # The heavy lifting (repeated medians) happens in the pythran extension.
    slope, intercept = siegelslopes_pythran(y, x, method)
    return SiegelslopesResult(slope=slope, intercept=intercept)
|
||||
10970
venv/lib/python3.12/site-packages/scipy/stats/_stats_py.py
Normal file
10970
venv/lib/python3.12/site-packages/scipy/stats/_stats_py.py
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
686
venv/lib/python3.12/site-packages/scipy/stats/_survival.py
Normal file
686
venv/lib/python3.12/site-packages/scipy/stats/_survival.py
Normal file
@ -0,0 +1,686 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import TYPE_CHECKING
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
from scipy import special, interpolate, stats
|
||||
from scipy.stats._censored_data import CensoredData
|
||||
from scipy.stats._common import ConfidenceInterval
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import Literal
|
||||
import numpy.typing as npt
|
||||
|
||||
|
||||
__all__ = ['ecdf', 'logrank']
|
||||
|
||||
|
||||
@dataclass
class EmpiricalDistributionFunction:
    """An empirical distribution function produced by `scipy.stats.ecdf`

    Attributes
    ----------
    quantiles : ndarray
        The unique values of the sample from which the
        `EmpiricalDistributionFunction` was estimated.
    probabilities : ndarray
        The point estimates of the cumulative distribution function (CDF) or
        its complement, the survival function (SF), corresponding with
        `quantiles`.
    """
    quantiles: np.ndarray
    probabilities: np.ndarray
    # Exclude these from __str__
    _n: np.ndarray = field(repr=False)  # number "at risk"
    _d: np.ndarray = field(repr=False)  # number of "deaths"
    _sf: np.ndarray = field(repr=False)  # survival function for var estimate
    _kind: str = field(repr=False)  # type of function: "cdf" or "sf"

    def __init__(self, q, p, n, d, kind):
        # `n`/`d` are None for the CI-bound instances created in
        # `confidence_interval`, which is why that method guards on `_n`.
        self.probabilities = p
        self.quantiles = q
        self._n = n
        self._d = d
        # Variance formulas are expressed in terms of the SF, so store the
        # SF regardless of which function this instance represents.
        self._sf = p if kind == 'sf' else 1 - p
        self._kind = kind

        f0 = 1 if kind == 'sf' else 0  # leftmost function value
        f1 = 1 - f0
        # fill_value can't handle edge cases at infinity
        x = np.insert(q, [0, len(q)], [-np.inf, np.inf])
        y = np.insert(p, [0, len(p)], [f0, f1])
        # `or` conditions handle the case of empty x, points
        # kind='previous' produces the right-continuous step function.
        self._f = interpolate.interp1d(x, y, kind='previous',
                                       assume_sorted=True)

    def evaluate(self, x):
        """Evaluate the empirical CDF/SF function at the input.

        Parameters
        ----------
        x : ndarray
            Argument to the CDF/SF

        Returns
        -------
        y : ndarray
            The CDF/SF evaluated at the input
        """
        return self._f(x)

    def plot(self, ax=None, **matplotlib_kwargs):
        """Plot the empirical distribution function

        Available only if ``matplotlib`` is installed.

        Parameters
        ----------
        ax : matplotlib.axes.Axes
            Axes object to draw the plot onto, otherwise uses the current Axes.

        **matplotlib_kwargs : dict, optional
            Keyword arguments passed directly to `matplotlib.axes.Axes.step`.
            Unless overridden, ``where='post'``.

        Returns
        -------
        lines : list of `matplotlib.lines.Line2D`
            Objects representing the plotted data
        """
        try:
            import matplotlib  # noqa: F401
        except ModuleNotFoundError as exc:
            message = "matplotlib must be installed to use method `plot`."
            raise ModuleNotFoundError(message) from exc

        if ax is None:
            import matplotlib.pyplot as plt
            ax = plt.gca()

        kwargs = {'where': 'post'}
        kwargs.update(matplotlib_kwargs)

        delta = np.ptp(self.quantiles)*0.05  # how far past sample edge to plot
        q = self.quantiles
        # Extend the plotted range slightly beyond the sample so the
        # leftmost/rightmost steps are visible.
        q = [q[0] - delta] + list(q) + [q[-1] + delta]

        return ax.step(q, self.evaluate(q), **kwargs)

    def confidence_interval(self, confidence_level=0.95, *, method='linear'):
        """Compute a confidence interval around the CDF/SF point estimate

        Parameters
        ----------
        confidence_level : float, default: 0.95
            Confidence level for the computed confidence interval

        method : str, {"linear", "log-log"}
            Method used to compute the confidence interval. Options are
            "linear" for the conventional Greenwood confidence interval
            (default) and "log-log" for the "exponential Greenwood",
            log-negative-log-transformed confidence interval.

        Returns
        -------
        ci : ``ConfidenceInterval``
            An object with attributes ``low`` and ``high``, instances of
            `~scipy.stats._result_classes.EmpiricalDistributionFunction` that
            represent the lower and upper bounds (respectively) of the
            confidence interval.

        Notes
        -----
        Confidence intervals are computed according to the Greenwood formula
        (``method='linear'``) or the more recent "exponential Greenwood"
        formula (``method='log-log'``) as described in [1]_. The conventional
        Greenwood formula can result in lower confidence limits less than 0
        and upper confidence limits greater than 1; these are clipped to the
        unit interval. NaNs may be produced by either method; these are
        features of the formulas.

        References
        ----------
        .. [1] Sawyer, Stanley. "The Greenwood and Exponential Greenwood
               Confidence Intervals in Survival Analysis."
               https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf

        """
        # CI-bound instances are constructed with _n/_d = None (below), so
        # they cannot compute a CI of their own.
        message = ("Confidence interval bounds do not implement a "
                   "`confidence_interval` method.")
        if self._n is None:
            raise NotImplementedError(message)

        methods = {'linear': self._linear_ci,
                   'log-log': self._loglog_ci}

        message = f"`method` must be one of {set(methods)}."
        if method.lower() not in methods:
            raise ValueError(message)

        message = "`confidence_level` must be a scalar between 0 and 1."
        confidence_level = np.asarray(confidence_level)[()]
        if confidence_level.shape or not (0 <= confidence_level <= 1):
            raise ValueError(message)

        method_fun = methods[method.lower()]
        low, high = method_fun(confidence_level)

        message = ("The confidence interval is undefined at some observations."
                   " This is a feature of the mathematical formula used, not"
                   " an error in its implementation.")
        if np.any(np.isnan(low) | np.isnan(high)):
            warnings.warn(message, RuntimeWarning, stacklevel=2)

        # Greenwood bounds may leave [0, 1]; clip to the unit interval.
        low, high = np.clip(low, 0, 1), np.clip(high, 0, 1)
        low = EmpiricalDistributionFunction(self.quantiles, low, None, None,
                                            self._kind)
        high = EmpiricalDistributionFunction(self.quantiles, high, None, None,
                                             self._kind)
        return ConfidenceInterval(low, high)

    def _linear_ci(self, confidence_level):
        """Conventional Greenwood CI bounds (symmetric, untransformed)."""
        sf, d, n = self._sf, self._d, self._n
        # When n == d, Greenwood's formula divides by zero.
        # When s != 0, this can be ignored: var == inf, and CI is [0, 1]
        # When s == 0, this results in NaNs. Produce an informative warning.
        with np.errstate(divide='ignore', invalid='ignore'):
            var = sf ** 2 * np.cumsum(d / (n * (n - d)))

        se = np.sqrt(var)
        z = special.ndtri(1 / 2 + confidence_level / 2)

        z_se = z * se
        low = self.probabilities - z_se
        high = self.probabilities + z_se

        return low, high

    def _loglog_ci(self, confidence_level):
        """"Exponential Greenwood" CI bounds (log-negative-log transform)."""
        sf, d, n = self._sf, self._d, self._n

        with np.errstate(divide='ignore', invalid='ignore'):
            var = 1 / np.log(sf) ** 2 * np.cumsum(d / (n * (n - d)))

        se = np.sqrt(var)
        z = special.ndtri(1 / 2 + confidence_level / 2)

        with np.errstate(divide='ignore'):
            lnl_points = np.log(-np.log(sf))

        z_se = z * se
        low = np.exp(-np.exp(lnl_points + z_se))
        high = np.exp(-np.exp(lnl_points - z_se))
        # The transform is expressed for the SF; reflect the bounds for a CDF.
        if self._kind == "cdf":
            low, high = 1-high, 1-low

        return low, high
|
||||
|
||||
|
||||
@dataclass
class ECDFResult:
    """ Result object returned by `scipy.stats.ecdf`

    Attributes
    ----------
    cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
        An object representing the empirical cumulative distribution function.
    sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
        An object representing the complement of the empirical cumulative
        distribution function.
    """
    cdf: EmpiricalDistributionFunction
    sf: EmpiricalDistributionFunction

    def __init__(self, q, cdf, sf, n, d):
        # Both functions share the same quantiles, at-risk counts `n`, and
        # event counts `d`; they differ only in probabilities and kind.
        self.sf = EmpiricalDistributionFunction(q, sf, n, d, "sf")
        self.cdf = EmpiricalDistributionFunction(q, cdf, n, d, "cdf")
|
||||
|
||||
|
||||
def _iv_CensoredData(
    sample: npt.ArrayLike | CensoredData, param_name: str = 'sample'
) -> CensoredData:
    """Attempt to convert `sample` to `CensoredData`."""
    if isinstance(sample, CensoredData):
        return sample
    try:
        # The CensoredData constructor performs input
        # standardization/validation.
        return CensoredData(uncensored=sample)
    except ValueError as e:
        # Re-raise with the caller's parameter name in the message.
        raise type(e)(str(e).replace('uncensored', param_name)) from e
|
||||
|
||||
|
||||
def ecdf(sample: npt.ArrayLike | CensoredData) -> ECDFResult:
    """Empirical cumulative distribution function of a sample.

    The empirical cumulative distribution function (ECDF) is a step function
    estimate of the CDF of the distribution underlying a sample. This function
    returns objects representing both the empirical distribution function and
    its complement, the empirical survival function.

    Parameters
    ----------
    sample : 1D array_like or `scipy.stats.CensoredData`
        Besides array_like, instances of `scipy.stats.CensoredData` containing
        uncensored and right-censored observations are supported. Currently,
        other instances of `scipy.stats.CensoredData` will result in a
        ``NotImplementedError``.

    Returns
    -------
    res : `~scipy.stats._result_classes.ECDFResult`
        An object with the following attributes.

        cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
            An object representing the empirical cumulative distribution
            function.
        sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
            An object representing the empirical survival function.

        The `cdf` and `sf` attributes themselves have the following attributes.

        quantiles : ndarray
            The unique values in the sample that defines the empirical CDF/SF.
        probabilities : ndarray
            The point estimates of the probabilities corresponding with
            `quantiles`.

        And the following methods:

        evaluate(x) :
            Evaluate the CDF/SF at the argument.

        plot(ax) :
            Plot the CDF/SF on the provided axes.

        confidence_interval(confidence_level=0.95) :
            Compute the confidence interval around the CDF/SF at the values in
            `quantiles`.

    Notes
    -----
    When each observation of the sample is a precise measurement, the ECDF
    steps up by ``1/len(sample)`` at each of the observations [1]_.

    When observations are lower bounds, upper bounds, or both upper and lower
    bounds, the data is said to be "censored", and `sample` may be provided as
    an instance of `scipy.stats.CensoredData`.

    For right-censored data, the ECDF is given by the Kaplan-Meier estimator
    [2]_; other forms of censoring are not supported at this time.

    Confidence intervals are computed according to the Greenwood formula or the
    more recent "Exponential Greenwood" formula as described in [4]_.

    References
    ----------
    .. [1] Conover, William Jay. Practical nonparametric statistics. Vol. 350.
           John Wiley & Sons, 1999.

    .. [2] Kaplan, Edward L., and Paul Meier. "Nonparametric estimation from
           incomplete observations." Journal of the American statistical
           association 53.282 (1958): 457-481.

    .. [3] Goel, Manish Kumar, Pardeep Khanna, and Jugal Kishore.
           "Understanding survival analysis: Kaplan-Meier estimate."
           International journal of Ayurveda research 1.4 (2010): 274.

    .. [4] Sawyer, Stanley. "The Greenwood and Exponential Greenwood Confidence
           Intervals in Survival Analysis."
           https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf

    Examples
    --------
    **Uncensored Data**

    As in the example from [1]_ page 79, five boys were selected at random from
    those in a single high school. Their one-mile run times were recorded as
    follows.

    >>> sample = [6.23, 5.58, 7.06, 6.42, 5.20]  # one-mile run times (minutes)

    The empirical distribution function, which approximates the distribution
    function of one-mile run times of the population from which the boys were
    sampled, is calculated as follows.

    >>> from scipy import stats
    >>> res = stats.ecdf(sample)
    >>> res.cdf.quantiles
    array([5.2 , 5.58, 6.23, 6.42, 7.06])
    >>> res.cdf.probabilities
    array([0.2, 0.4, 0.6, 0.8, 1. ])

    To plot the result as a step function:

    >>> import matplotlib.pyplot as plt
    >>> ax = plt.subplot()
    >>> res.cdf.plot(ax)
    >>> ax.set_xlabel('One-Mile Run Time (minutes)')
    >>> ax.set_ylabel('Empirical CDF')
    >>> plt.show()

    **Right-censored Data**

    As in the example from [1]_ page 91, the lives of ten car fanbelts were
    tested. Five tests concluded because the fanbelt being tested broke, but
    the remaining tests concluded for other reasons (e.g. the study ran out of
    funding, but the fanbelt was still functional). The mileage driven
    with the fanbelts were recorded as follows.

    >>> broken = [77, 47, 81, 56, 80]  # in thousands of miles driven
    >>> unbroken = [62, 60, 43, 71, 37]

    Precise survival times of the fanbelts that were still functional at the
    end of the tests are unknown, but they are known to exceed the values
    recorded in ``unbroken``. Therefore, these observations are said to be
    "right-censored", and the data is represented using
    `scipy.stats.CensoredData`.

    >>> sample = stats.CensoredData(uncensored=broken, right=unbroken)

    The empirical survival function is calculated as follows.

    >>> res = stats.ecdf(sample)
    >>> res.sf.quantiles
    array([37., 43., 47., 56., 60., 62., 71., 77., 80., 81.])
    >>> res.sf.probabilities
    array([1.   , 1.   , 0.875, 0.75 , 0.75 , 0.75 , 0.75 , 0.5  , 0.25 , 0.  ])

    To plot the result as a step function:

    >>> ax = plt.subplot()
    >>> res.cdf.plot(ax)
    >>> ax.set_xlabel('Fanbelt Survival Time (thousands of miles)')
    >>> ax.set_ylabel('Empirical SF')
    >>> plt.show()

    """
    sample = _iv_CensoredData(sample)
    n_censored = sample.num_censored()

    # Dispatch on the kind of censoring present in the sample.
    if n_censored == 0:
        t, cdf, sf, n, d = _ecdf_uncensored(sample._uncensor())
    elif n_censored == sample._right.size:
        t, cdf, sf, n, d = _ecdf_right_censored(sample)
    else:
        # Support additional censoring options in follow-up PRs
        message = ("Currently, only uncensored and right-censored data is "
                   "supported.")
        raise NotImplementedError(message)

    return ECDFResult(t, cdf, sf, n, d)
|
||||
|
||||
|
||||
def _ecdf_uncensored(sample):
|
||||
sample = np.sort(sample)
|
||||
x, counts = np.unique(sample, return_counts=True)
|
||||
|
||||
# [1].81 "the fraction of [observations] that are less than or equal to x
|
||||
events = np.cumsum(counts)
|
||||
n = sample.size
|
||||
cdf = events / n
|
||||
|
||||
# [1].89 "the relative frequency of the sample that exceeds x in value"
|
||||
sf = 1 - cdf
|
||||
|
||||
at_risk = np.concatenate(([n], n - events[:-1]))
|
||||
return x, cdf, sf, at_risk, counts
|
||||
|
||||
|
||||
def _ecdf_right_censored(sample):
|
||||
# It is conventional to discuss right-censored data in terms of
|
||||
# "survival time", "death", and "loss" (e.g. [2]). We'll use that
|
||||
# terminology here.
|
||||
# This implementation was influenced by the references cited and also
|
||||
# https://www.youtube.com/watch?v=lxoWsVco_iM
|
||||
# https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
|
||||
# In retrospect it is probably most easily compared against [3].
|
||||
# Ultimately, the data needs to be sorted, so this implementation is
|
||||
# written to avoid a separate call to `unique` after sorting. In hope of
|
||||
# better performance on large datasets, it also computes survival
|
||||
# probabilities at unique times only rather than at each observation.
|
||||
tod = sample._uncensored # time of "death"
|
||||
tol = sample._right # time of "loss"
|
||||
times = np.concatenate((tod, tol))
|
||||
died = np.asarray([1]*tod.size + [0]*tol.size)
|
||||
|
||||
# sort by times
|
||||
i = np.argsort(times)
|
||||
times = times[i]
|
||||
died = died[i]
|
||||
at_risk = np.arange(times.size, 0, -1)
|
||||
|
||||
# logical indices of unique times
|
||||
j = np.diff(times, prepend=-np.inf, append=np.inf) > 0
|
||||
j_l = j[:-1] # first instances of unique times
|
||||
j_r = j[1:] # last instances of unique times
|
||||
|
||||
# get number at risk and deaths at each unique time
|
||||
t = times[j_l] # unique times
|
||||
n = at_risk[j_l] # number at risk at each unique time
|
||||
cd = np.cumsum(died)[j_r] # cumulative deaths up to/including unique times
|
||||
d = np.diff(cd, prepend=0) # deaths at each unique time
|
||||
|
||||
# compute survival function
|
||||
sf = np.cumprod((n - d) / n)
|
||||
cdf = 1 - sf
|
||||
return t, cdf, sf, n, d
|
||||
|
||||
|
||||
@dataclass
class LogRankResult:
    """Result object returned by `scipy.stats.logrank`.

    Attributes
    ----------
    statistic : float ndarray
        The computed statistic (defined below). Its magnitude is the
        square root of the magnitude returned by most other logrank test
        implementations.
    pvalue : float ndarray
        The computed p-value of the test.
    """
    # Plain result container; fields are populated by `logrank` below.
    statistic: np.ndarray
    pvalue: np.ndarray
|
||||
|
||||
|
||||
def logrank(
    x: npt.ArrayLike | CensoredData,
    y: npt.ArrayLike | CensoredData,
    alternative: Literal['two-sided', 'less', 'greater'] = "two-sided"
) -> LogRankResult:
    r"""Compare the survival distributions of two samples via the logrank test.

    Parameters
    ----------
    x, y : array_like or CensoredData
        Samples to compare based on their empirical survival functions.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.

        The null hypothesis is that the survival distributions of the two
        groups, say *X* and *Y*, are identical.

        The following alternative hypotheses [4]_ are available (default is
        'two-sided'):

        * 'two-sided': the survival distributions of the two groups are not
          identical.
        * 'less': survival of group *X* is favored: the group *X* failure rate
          function is less than the group *Y* failure rate function at some
          times.
        * 'greater': survival of group *Y* is favored: the group *X* failure
          rate function is greater than the group *Y* failure rate function at
          some times.

    Returns
    -------
    res : `~scipy.stats._result_classes.LogRankResult`
        An object containing attributes:

        statistic : float ndarray
            The computed statistic (defined below). Its magnitude is the
            square root of the magnitude returned by most other logrank test
            implementations.
        pvalue : float ndarray
            The computed p-value of the test.

    See Also
    --------
    scipy.stats.ecdf

    Notes
    -----
    The logrank test [1]_ compares the observed number of events to
    the expected number of events under the null hypothesis that the two
    samples were drawn from the same distribution. The statistic is

    .. math::

        Z_i = \frac{\sum_{j=1}^J(O_{i,j}-E_{i,j})}{\sqrt{\sum_{j=1}^J V_{i,j}}}
        \rightarrow \mathcal{N}(0,1)

    where

    .. math::

        E_{i,j} = O_j \frac{N_{i,j}}{N_j},
        \qquad
        V_{i,j} = E_{i,j} \left(\frac{N_j-O_j}{N_j}\right)
        \left(\frac{N_j-N_{i,j}}{N_j-1}\right),

    :math:`i` denotes the group (i.e. it may assume values :math:`x` or
    :math:`y`, or it may be omitted to refer to the combined sample)
    :math:`j` denotes the time (at which an event occurred),
    :math:`N` is the number of subjects at risk just before an event occurred,
    and :math:`O` is the observed number of events at that time.

    The ``statistic`` :math:`Z_x` returned by `logrank` is the (signed) square
    root of the statistic returned by many other implementations. Under the
    null hypothesis, :math:`Z_x**2` is asymptotically distributed according to
    the chi-squared distribution with one degree of freedom. Consequently,
    :math:`Z_x` is asymptotically distributed according to the standard normal
    distribution. The advantage of using :math:`Z_x` is that the sign
    information (i.e. whether the observed number of events tends to be less
    than or greater than the number expected under the null hypothesis) is
    preserved, allowing `scipy.stats.logrank` to offer one-sided alternative
    hypotheses.

    References
    ----------
    .. [1] Mantel N. "Evaluation of survival data and two new rank order
           statistics arising in its consideration."
           Cancer Chemotherapy Reports, 50(3):163-170, PMID: 5910392, 1966
    .. [2] Bland, Altman, "The logrank test", BMJ, 328:1073,
           :doi:`10.1136/bmj.328.7447.1073`, 2004
    .. [3] "Logrank test", Wikipedia,
           https://en.wikipedia.org/wiki/Logrank_test
    .. [4] Brown, Mark. "On the choice of variance for the log rank test."
           Biometrika 71.1 (1984): 65-74.
    .. [5] Klein, John P., and Melvin L. Moeschberger. Survival analysis:
           techniques for censored and truncated data. Vol. 1230. New York:
           Springer, 2003.

    Examples
    --------
    Reference [2]_ compared the survival times of patients with two different
    types of recurrent malignant gliomas. The samples below record the time
    (number of weeks) for which each patient participated in the study. The
    `scipy.stats.CensoredData` class is used because the data is
    right-censored: the uncensored observations correspond with observed deaths
    whereas the censored observations correspond with the patient leaving the
    study for another reason.

    >>> from scipy import stats
    >>> x = stats.CensoredData(
    ...     uncensored=[6, 13, 21, 30, 37, 38, 49, 50,
    ...                 63, 79, 86, 98, 202, 219],
    ...     right=[31, 47, 80, 82, 82, 149]
    ... )
    >>> y = stats.CensoredData(
    ...     uncensored=[10, 10, 12, 13, 14, 15, 16, 17, 18, 20, 24, 24,
    ...                 25, 28,30, 33, 35, 37, 40, 40, 46, 48, 76, 81,
    ...                 82, 91, 112, 181],
    ...     right=[34, 40, 70]
    ... )

    We can calculate and visualize the empirical survival functions
    of both groups as follows.

    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> ax = plt.subplot()
    >>> ecdf_x = stats.ecdf(x)
    >>> ecdf_x.sf.plot(ax, label='Astrocytoma')
    >>> ecdf_y = stats.ecdf(y)
    >>> ecdf_y.sf.plot(ax, label='Glioblastoma')
    >>> ax.set_xlabel('Time to death (weeks)')
    >>> ax.set_ylabel('Empirical SF')
    >>> plt.legend()
    >>> plt.show()

    Visual inspection of the empirical survival functions suggests that the
    survival times tend to be different between the two groups. To formally
    assess whether the difference is significant at the 1% level, we use the
    logrank test.

    >>> res = stats.logrank(x=x, y=y)
    >>> res.statistic
    -2.73799...
    >>> res.pvalue
    0.00618...

    The p-value is less than 1%, so we can consider the data to be evidence
    against the null hypothesis in favor of the alternative that there is a
    difference between the two survival functions.

    """
    # Input validation. `alternative` IV handled in `_get_pvalue` below.
    x = _iv_CensoredData(sample=x, param_name='x')
    y = _iv_CensoredData(sample=y, param_name='y')

    # Combined sample. (Under H0, the two groups are identical.)
    xy = CensoredData(
        uncensored=np.concatenate((x._uncensored, y._uncensored)),
        right=np.concatenate((x._right, y._right))
    )

    # Extract data from the combined sample
    res = ecdf(xy)
    idx = res.sf._d.astype(bool)  # indices of observed events
    times_xy = res.sf.quantiles[idx]  # unique times of observed events
    at_risk_xy = res.sf._n[idx]  # combined number of subjects at risk
    deaths_xy = res.sf._d[idx]  # combined number of events

    # Get the number at risk within each sample.
    # First compute the number at risk in group X at each of the `times_xy`.
    # Could use `interpolate_1d`, but this is more compact.
    res_x = ecdf(x)
    i = np.searchsorted(res_x.sf.quantiles, times_xy)
    at_risk_x = np.append(res_x.sf._n, 0)[i]  # 0 at risk after last time
    # Subtract from the combined number at risk to get number at risk in Y
    at_risk_y = at_risk_xy - at_risk_x

    # Compute the variance.
    num = at_risk_x * at_risk_y * deaths_xy * (at_risk_xy - deaths_xy)
    den = at_risk_xy**2 * (at_risk_xy - 1)
    # Note: when `at_risk_xy == 1`, we would have `at_risk_xy - 1 == 0` in the
    # numerator and denominator. Simplifying the fraction symbolically, we
    # would always find the overall quotient to be zero, so don't compute it.
    i = at_risk_xy > 1
    sum_var = np.sum(num[i]/den[i])

    # Get the observed and expected number of deaths in group X
    n_died_x = x._uncensored.size
    sum_exp_deaths_x = np.sum(at_risk_x * (deaths_xy/at_risk_xy))

    # Compute the statistic. This is the square root of that in references.
    statistic = (n_died_x - sum_exp_deaths_x)/np.sqrt(sum_var)

    # Equivalent to chi2(df=1).sf(statistic**2) when alternative='two-sided'
    norm = stats._stats_py._SimpleNormal()
    pvalue = stats._stats_py._get_pvalue(statistic, norm, alternative, xp=np)

    # `[()]` converts 0-d arrays to scalars without changing ndarrays.
    return LogRankResult(statistic=statistic[()], pvalue=pvalue[()])
|
||||
@ -0,0 +1,199 @@
|
||||
import numpy as np
|
||||
from numpy import poly1d
|
||||
from scipy.special import beta
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
|
||||
# Tukey Lambda variance function. Version 0.17 of mpmath was used.
|
||||
#---------------------------------------------------------------------------
|
||||
# import mpmath as mp
|
||||
#
|
||||
# mp.mp.dps = 60
|
||||
#
|
||||
# one = mp.mpf(1)
|
||||
# two = mp.mpf(2)
|
||||
#
|
||||
# def mpvar(lam):
|
||||
# if lam == 0:
|
||||
# v = mp.pi**2 / three
|
||||
# else:
|
||||
# v = (two / lam**2) * (one / (one + two*lam) -
|
||||
# mp.beta(lam + one, lam + one))
|
||||
# return v
|
||||
#
|
||||
# t = mp.taylor(mpvar, 0, 8)
|
||||
# p, q = mp.pade(t, 4, 4)
|
||||
# print("p =", [mp.fp.mpf(c) for c in p])
|
||||
# print("q =", [mp.fp.mpf(c) for c in q])
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
# Pade coefficients for the Tukey Lambda variance function.
|
||||
# Pade coefficients for the Tukey Lambda variance function.
_tukeylambda_var_pc = [3.289868133696453, 0.7306125098871127,
                       -0.5370742306855439, 0.17292046290190008,
                       -0.02371146284628187]
_tukeylambda_var_qc = [1.0, 3.683605511659861, 4.184152498888124,
                       1.7660926747377275, 0.2643989311168465]

# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda variance.
_tukeylambda_var_p = poly1d(_tukeylambda_var_pc[::-1])
_tukeylambda_var_q = poly1d(_tukeylambda_var_qc[::-1])


def tukeylambda_variance(lam):
    """Variance of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the variance.

    Returns
    -------
    v : ndarray
        The variance. For lam < -0.5, the variance is not defined, so
        np.nan is returned. For lam = -0.5, np.inf is returned.

    Notes
    -----
    In an interval around lambda=0, this function uses the [4,4] Pade
    approximation to compute the variance. Otherwise it uses the standard
    formula (https://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
    Pade approximation is used because the standard formula has a removable
    discontinuity at lambda = 0, and does not produce accurate numerical
    results near lambda = 0.
    """
    lam = np.asarray(lam)
    shp = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # For absolute values of lam less than threshold, use the Pade
    # approximation.
    threshold = 0.075

    # Play games with masks to implement the conditional evaluation of
    # the distribution.
    # lambda < -0.5:  var = nan
    low_mask = lam < -0.5
    # lambda == -0.5: var = inf
    neghalf_mask = lam == -0.5
    # abs(lambda) < threshold:  use Pade approximation
    small_mask = np.abs(lam) < threshold
    # else the "regular" case:  use the explicit formula.
    reg_mask = ~(low_mask | neghalf_mask | small_mask)

    # Get the 'lam' values for the cases where they are needed.
    small = lam[small_mask]
    reg = lam[reg_mask]

    # Compute the function for each case.
    v = np.empty_like(lam)
    v[low_mask] = np.nan
    v[neghalf_mask] = np.inf
    if small.size > 0:
        # Use the Pade approximation near lambda = 0.
        v[small_mask] = _tukeylambda_var_p(small) / _tukeylambda_var_q(small)
    if reg.size > 0:
        v[reg_mask] = (2.0 / reg**2) * (1.0 / (1.0 + 2 * reg) -
                                        beta(reg + 1, reg + 1))

    # The return value will be a numpy array; resetting the shape ensures
    # that if `lam` was a scalar, the return value is a 0-d array.
    v.shape = shp
    return v
|
||||
|
||||
|
||||
# The following code was used to generate the Pade coefficients for the
|
||||
# Tukey Lambda kurtosis function. Version 0.17 of mpmath was used.
|
||||
#---------------------------------------------------------------------------
|
||||
# import mpmath as mp
|
||||
#
|
||||
# mp.mp.dps = 60
|
||||
#
|
||||
# one = mp.mpf(1)
|
||||
# two = mp.mpf(2)
|
||||
# three = mp.mpf(3)
|
||||
# four = mp.mpf(4)
|
||||
#
|
||||
# def mpkurt(lam):
|
||||
# if lam == 0:
|
||||
# k = mp.mpf(6)/5
|
||||
# else:
|
||||
# numer = (one/(four*lam+one) - four*mp.beta(three*lam+one, lam+one) +
|
||||
# three*mp.beta(two*lam+one, two*lam+one))
|
||||
# denom = two*(one/(two*lam+one) - mp.beta(lam+one,lam+one))**2
|
||||
# k = numer / denom - three
|
||||
# return k
|
||||
#
|
||||
# # There is a bug in mpmath 0.17: when we use the 'method' keyword of the
|
||||
# # taylor function and we request a degree 9 Taylor polynomial, we actually
|
||||
# # get degree 8.
|
||||
# t = mp.taylor(mpkurt, 0, 9, method='quad', radius=0.01)
|
||||
# t = [mp.chop(c, tol=1e-15) for c in t]
|
||||
# p, q = mp.pade(t, 4, 4)
|
||||
# print("p =", [mp.fp.mpf(c) for c in p])
|
||||
# print("q =", [mp.fp.mpf(c) for c in q])
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
# Pade coefficients for the Tukey Lambda kurtosis function.
|
||||
# Pade coefficients for the Tukey Lambda kurtosis function.
_tukeylambda_kurt_pc = [1.2, -5.853465139719495, -22.653447381131077,
                        0.20601184383406815, 4.59796302262789]
_tukeylambda_kurt_qc = [1.0, 7.171149192233599, 12.96663094361842,
                        0.43075235247853005, -2.789746758009912]

# numpy.poly1d instances for the numerator and denominator of the
# Pade approximation to the Tukey Lambda kurtosis.
_tukeylambda_kurt_p = poly1d(_tukeylambda_kurt_pc[::-1])
_tukeylambda_kurt_q = poly1d(_tukeylambda_kurt_qc[::-1])


def tukeylambda_kurtosis(lam):
    """Excess kurtosis of the Tukey Lambda distribution.

    Parameters
    ----------
    lam : array_like
        The lambda values at which to compute the kurtosis.

    Returns
    -------
    k : ndarray
        The excess kurtosis. For lam < -0.25, the kurtosis is not defined,
        so np.nan is returned. For lam = -0.25, np.inf is returned.

    Notes
    -----
    In an interval around lambda=0, this function uses the [4,4] Pade
    approximation to compute the kurtosis. Otherwise it uses the standard
    formula (https://en.wikipedia.org/wiki/Tukey_lambda_distribution). The
    Pade approximation is used because the standard formula has a removable
    discontinuity at lambda = 0, and does not produce accurate numerical
    results near lambda = 0.
    """
    lam = np.asarray(lam)
    shp = lam.shape
    lam = np.atleast_1d(lam).astype(np.float64)

    # For absolute values of lam less than threshold, use the Pade
    # approximation.
    threshold = 0.055

    # Use masks to implement the conditional evaluation of the kurtosis.
    # lambda < -0.25: kurtosis = nan
    low_mask = lam < -0.25
    # lambda == -0.25: kurtosis = inf
    negqrtr_mask = lam == -0.25
    # lambda near 0: use Pade approximation
    small_mask = np.abs(lam) < threshold
    # else the "regular" case: use the explicit formula.
    reg_mask = ~(low_mask | negqrtr_mask | small_mask)

    # Get the 'lam' values for the cases where they are needed.
    small = lam[small_mask]
    reg = lam[reg_mask]

    # Compute the function for each case.
    k = np.empty_like(lam)
    k[low_mask] = np.nan
    k[negqrtr_mask] = np.inf
    if small.size > 0:
        k[small_mask] = _tukeylambda_kurt_p(small) / _tukeylambda_kurt_q(small)
    if reg.size > 0:
        numer = (1.0 / (4 * reg + 1) - 4 * beta(3 * reg + 1, reg + 1) +
                 3 * beta(2 * reg + 1, 2 * reg + 1))
        denom = 2 * (1.0/(2 * reg + 1) - beta(reg + 1, reg + 1))**2
        k[reg_mask] = numer / denom - 3

    # The return value will be a numpy array; resetting the shape ensures that
    # if `lam` was a scalar, the return value is a 0-d array.
    k.shape = shp
    return k
|
||||
Binary file not shown.
@ -0,0 +1,179 @@
|
||||
from __future__ import annotations
|
||||
import numpy as np
|
||||
from typing import (overload, Callable, NamedTuple, Protocol)
|
||||
import numpy.typing as npt
|
||||
from scipy._lib._util import SeedType
|
||||
import scipy.stats as stats
|
||||
|
||||
|
||||
# Scalar-like inputs: anything NumPy treats as a 0-d array.
ArrayLike0D = bool | int | float | complex | str | bytes | np.generic


# Populated by the implementation module; declared here for type checkers.
__all__: list[str]
|
||||
|
||||
|
||||
class UNURANError(RuntimeError):
    # Stub: runtime error raised by the underlying sampling library.
    ...
|
||||
|
||||
|
||||
class Method:
    # Common base stub for the samplers below: `rvs` draws variates
    # (a scalar when `size` is None, an ndarray otherwise) and
    # `set_random_state` reseeds the generator.
    @overload
    def rvs(self, size: None = ...) -> float | int: ...  # type: ignore[overload-overlap]
    @overload
    def rvs(self, size: int | tuple[int, ...] = ...) -> np.ndarray: ...
    def set_random_state(self, random_state: SeedType) -> None: ...
|
||||
|
||||
|
||||
class TDRDist(Protocol):
    # Structural type: a distribution exposing its pdf, the pdf's
    # derivative, and a (lower, upper) support interval.
    @property
    def pdf(self) -> Callable[..., float]: ...
    @property
    def dpdf(self) -> Callable[..., float]: ...
    @property
    def support(self) -> tuple[float, float]: ...
|
||||
|
||||
|
||||
class TransformedDensityRejection(Method):
    # Stub for the transformed-density-rejection sampler; keyword-only
    # configuration, plus hat/squeeze diagnostics and `ppf_hat`.
    def __init__(self,
                 dist: TDRDist,
                 *,
                 mode: None | float = ...,
                 center: None | float = ...,
                 domain: None | tuple[float, float] = ...,
                 c: float = ...,
                 construction_points: int | npt.ArrayLike = ...,
                 use_dars: bool = ...,
                 max_squeeze_hat_ratio: float = ...,
                 random_state: SeedType = ...) -> None: ...
    @property
    def squeeze_hat_ratio(self) -> float: ...
    @property
    def squeeze_area(self) -> float: ...
    @overload
    def ppf_hat(self, u: ArrayLike0D) -> float: ...  # type: ignore[overload-overlap]
    @overload
    def ppf_hat(self, u: npt.ArrayLike) -> np.ndarray: ...
|
||||
|
||||
|
||||
class SROUDist(Protocol):
    # Structural type: a distribution with a pdf and a support interval.
    @property
    def pdf(self) -> Callable[..., float]: ...
    @property
    def support(self) -> tuple[float, float]: ...
|
||||
|
||||
|
||||
class SimpleRatioUniforms(Method):
    # Stub for the simple ratio-of-uniforms sampler.
    def __init__(self,
                 dist: SROUDist,
                 *,
                 mode: None | float = ...,
                 pdf_area: float = ...,
                 domain: None | tuple[float, float] = ...,
                 cdf_at_mode: float = ...,
                 random_state: SeedType = ...) -> None: ...
|
||||
|
||||
|
||||
class UError(NamedTuple):
    # u-error summary returned by `u_error`:
    # max_error — largest observed deviation
    # mean_absolute_error — average absolute deviation
    max_error: float
    mean_absolute_error: float
|
||||
|
||||
class PINVDist(Protocol):
    # Structural type: a distribution exposing pdf, cdf, and logpdf.
    @property
    def pdf(self) -> Callable[..., float]: ...
    @property
    def cdf(self) -> Callable[..., float]: ...
    @property
    def logpdf(self) -> Callable[..., float]: ...
|
||||
|
||||
|
||||
class NumericalInversePolynomial(Method):
    # Stub for the polynomial-interpolation inversion sampler; provides
    # ppf/cdf evaluation, a u-error estimate, and quasi-random draws.
    def __init__(self,
                 dist: PINVDist,
                 *,
                 mode: None | float = ...,
                 center: None | float = ...,
                 domain: None | tuple[float, float] = ...,
                 order: int = ...,
                 u_resolution: float = ...,
                 random_state: SeedType = ...) -> None: ...
    @property
    def intervals(self) -> int: ...
    @overload
    def ppf(self, u: ArrayLike0D) -> float: ...  # type: ignore[overload-overlap]
    @overload
    def ppf(self, u: npt.ArrayLike) -> np.ndarray: ...
    @overload
    def cdf(self, x: ArrayLike0D) -> float: ...  # type: ignore[overload-overlap]
    @overload
    def cdf(self, x: npt.ArrayLike) -> np.ndarray: ...
    def u_error(self, sample_size: int = ...) -> UError: ...
    def qrvs(self,
             size: None | int | tuple[int, ...] = ...,
             d: None | int = ...,
             qmc_engine: None | stats.qmc.QMCEngine = ...) -> npt.ArrayLike: ...
|
||||
|
||||
|
||||
class HINVDist(Protocol):
    # Structural type: a distribution exposing pdf, cdf, and support.
    @property
    def pdf(self) -> Callable[..., float]: ...
    @property
    def cdf(self) -> Callable[..., float]: ...
    @property
    def support(self) -> tuple[float, float]: ...
|
||||
|
||||
|
||||
class NumericalInverseHermite(Method):
    # Stub for the Hermite-interpolation inversion sampler.
    def __init__(self,
                 dist: HINVDist,
                 *,
                 domain: None | tuple[float, float] = ...,
                 order: int= ...,
                 u_resolution: float = ...,
                 construction_points: None | npt.ArrayLike = ...,
                 max_intervals: int = ...,
                 random_state: SeedType = ...) -> None: ...
    @property
    def intervals(self) -> int: ...
    @overload
    def ppf(self, u: ArrayLike0D) -> float: ...  # type: ignore[overload-overlap]
    @overload
    def ppf(self, u: npt.ArrayLike) -> np.ndarray: ...
    def qrvs(self,
             size: None | int | tuple[int, ...] = ...,
             d: None | int = ...,
             qmc_engine: None | stats.qmc.QMCEngine = ...) -> npt.ArrayLike: ...
    def u_error(self, sample_size: int = ...) -> UError: ...
|
||||
|
||||
|
||||
class DAUDist(Protocol):
    # Structural type: a discrete distribution with a pmf and support.
    @property
    def pmf(self) -> Callable[..., float]: ...
    @property
    def support(self) -> tuple[float, float]: ...
|
||||
|
||||
class DiscreteAliasUrn(Method):
    # Stub for the discrete alias-urn sampler; `dist` may be a pmf table
    # (array_like) or an object satisfying `DAUDist`.
    def __init__(self,
                 dist: npt.ArrayLike | DAUDist,
                 *,
                 domain: None | tuple[float, float] = ...,
                 urn_factor: float = ...,
                 random_state: SeedType = ...) -> None: ...
|
||||
|
||||
|
||||
class DGTDist(Protocol):
    # Structural type: a discrete distribution with a pmf and support.
    @property
    def pmf(self) -> Callable[..., float]: ...
    @property
    def support(self) -> tuple[float, float]: ...
|
||||
|
||||
class DiscreteGuideTable(Method):
    # Stub for the discrete guide-table sampler; also exposes `ppf`.
    def __init__(self,
                 dist: npt.ArrayLike | DGTDist,
                 *,
                 domain: None | tuple[float, float] = ...,
                 guide_factor: float = ...,
                 random_state: SeedType = ...) -> None: ...
    @overload
    def ppf(self, u: ArrayLike0D) -> float: ...  # type: ignore[overload-overlap]
    @overload
    def ppf(self, u: npt.ArrayLike) -> np.ndarray: ...
|
||||
128
venv/lib/python3.12/site-packages/scipy/stats/_variation.py
Normal file
128
venv/lib/python3.12/site-packages/scipy/stats/_variation.py
Normal file
@ -0,0 +1,128 @@
|
||||
import numpy as np
|
||||
|
||||
from scipy._lib._util import _get_nan
|
||||
from scipy._lib._array_api import array_namespace, xp_copysign
|
||||
|
||||
from ._axis_nan_policy import _axis_nan_policy_factory
|
||||
|
||||
|
||||
@_axis_nan_policy_factory(
    lambda x: x, n_outputs=1, result_to_tuple=lambda x: (x,)
)
def variation(a, axis=0, nan_policy='propagate', ddof=0, *, keepdims=False):
    """
    Compute the coefficient of variation.

    The coefficient of variation is the standard deviation divided by the
    mean.  This function is equivalent to::

        np.std(x, axis=axis, ddof=ddof) / np.mean(x)

    The default for ``ddof`` is 0, but many definitions of the coefficient
    of variation use the square root of the unbiased sample variance
    for the sample standard deviation, which corresponds to ``ddof=1``.

    The function does not take the absolute value of the mean of the data,
    so the return value is negative if the mean is negative.

    Parameters
    ----------
    a : array_like
        Input array.
    axis : int or None, optional
        Axis along which to calculate the coefficient of variation.
        Default is 0. If None, compute over the whole array `a`.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains ``nan``.
        The following options are available:

          * 'propagate': return ``nan``
          * 'raise': raise an exception
          * 'omit': perform the calculation with ``nan`` values omitted

        The default is 'propagate'.
    ddof : int, optional
        Gives the "Delta Degrees Of Freedom" used when computing the
        standard deviation.  The divisor used in the calculation of the
        standard deviation is ``N - ddof``, where ``N`` is the number of
        elements.  `ddof` must be less than ``N``; if it isn't, the result
        will be ``nan`` or ``inf``, depending on ``N`` and the values in
        the array.  By default `ddof` is zero for backwards compatibility,
        but it is recommended to use ``ddof=1`` to ensure that the sample
        standard deviation is computed as the square root of the unbiased
        sample variance.

    Returns
    -------
    variation : ndarray
        The calculated variation along the requested axis.

    Notes
    -----
    There are several edge cases that are handled without generating a
    warning:

    * If both the mean and the standard deviation are zero, ``nan``
      is returned.
    * If the mean is zero and the standard deviation is nonzero, ``inf``
      is returned.
    * If the input has length zero (either because the array has zero
      length, or all the input values are ``nan`` and ``nan_policy`` is
      ``'omit'``), ``nan`` is returned.
    * If the input contains ``inf``, ``nan`` is returned.

    References
    ----------
    .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
       Probability and Statistics Tables and Formulae. Chapman & Hall: New
       York. 2000.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import variation
    >>> variation([1, 2, 3, 4, 5], ddof=1)
    0.5270462766947299

    Compute the variation along a given dimension of an array that contains
    a few ``nan`` values:

    >>> x = np.array([[ 10.0, np.nan, 11.0, 19.0, 23.0, 29.0, 98.0],
    ...               [ 29.0, 30.0, 32.0, 33.0, 35.0, 56.0, 57.0],
    ...               [np.nan, np.nan, 12.0, 13.0, 16.0, 16.0, 17.0]])
    >>> variation(x, axis=1, ddof=1, nan_policy='omit')
    array([1.05109361, 0.31428986, 0.146483  ])

    """
    xp = array_namespace(a)
    a = xp.asarray(a)
    # `nan_policy` and `keepdims` are handled by `_axis_nan_policy`
    # `axis=None` is only handled for NumPy backend
    if axis is None:
        a = xp.reshape(a, (-1,))
        axis = 0

    n = a.shape[axis]
    NaN = _get_nan(a)

    if a.size == 0 or ddof > n:
        # Handle as a special case to avoid spurious warnings.
        # The return values, if any, are all nan.
        shp = list(a.shape)
        shp.pop(axis)
        result = xp.full(shp, fill_value=NaN)
        # `result[()]` unwraps a 0-d array into a scalar for scalar input.
        return result[()] if result.ndim == 0 else result

    mean_a = xp.mean(a, axis=axis)

    if ddof == n:
        # Another special case.  Result is either inf or nan.
        # With correction == n the corrected divisor (N - ddof) would be 0;
        # the uncorrected std is computed only to decide whether the spread
        # is zero (-> nan) or positive (-> inf carrying the mean's sign).
        std_a = xp.std(a, axis=axis, correction=0)
        result = xp.where(std_a > 0, xp_copysign(xp.asarray(xp.inf), mean_a), NaN)
        return result[()] if result.ndim == 0 else result

    # A zero mean produces a divide-by-zero here; per the Notes above this
    # intentionally yields inf/nan without emitting a warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        std_a = xp.std(a, axis=axis, correction=ddof)
        result = std_a / mean_a

    return result[()] if result.ndim == 0 else result
|
||||
@ -0,0 +1,38 @@
|
||||
# Warnings
|
||||
|
||||
|
||||
class DegenerateDataWarning(RuntimeWarning):
    """Warns when data is degenerate and results may not be reliable."""

    def __init__(self, msg=None):
        # Fall back to a generic message when the caller supplies none.
        default = "Degenerate data encountered; results may not be reliable."
        self.args = (default if msg is None else msg,)
|
||||
|
||||
|
||||
class ConstantInputWarning(DegenerateDataWarning):
    """Warns when all values in data are exactly equal."""

    def __init__(self, msg=None):
        # Fall back to a specific default message when none is given.
        default = ("All values in data are exactly equal; "
                   "results may not be reliable.")
        self.args = (default if msg is None else msg,)
|
||||
|
||||
|
||||
class NearConstantInputWarning(DegenerateDataWarning):
    """Warns when all values in data are nearly equal."""

    def __init__(self, msg=None):
        # Fall back to a specific default message when none is given.
        default = ("All values in data are nearly equal; "
                   "results may not be reliable.")
        self.args = (default if msg is None else msg,)
|
||||
|
||||
|
||||
# Errors
|
||||
|
||||
|
||||
class FitError(RuntimeError):
    """Represents an error condition when fitting a distribution to data."""

    def __init__(self, msg=None):
        # Fall back to a generic message when the caller supplies none.
        default = "An error occurred when fitting a distribution to data."
        self.args = (default if msg is None else msg,)
|
||||
246
venv/lib/python3.12/site-packages/scipy/stats/_wilcoxon.py
Normal file
246
venv/lib/python3.12/site-packages/scipy/stats/_wilcoxon.py
Normal file
@ -0,0 +1,246 @@
|
||||
import warnings
|
||||
import numpy as np
|
||||
|
||||
from scipy import stats
|
||||
from ._stats_py import _get_pvalue, _rankdata, _SimpleNormal
|
||||
from . import _morestats
|
||||
from ._axis_nan_policy import _broadcast_arrays
|
||||
from ._hypotests import _get_wilcoxon_distr
|
||||
from scipy._lib._util import _lazywhere, _get_nan
|
||||
|
||||
|
||||
class WilcoxonDistribution:
    """Null distribution of the Wilcoxon signed-rank statistic.

    Parameters
    ----------
    n : array_like of int
        Sample size(s); a PMF table is built once per distinct size.
    """

    def __init__(self, n):
        n = np.asarray(n).astype(int, copy=False)
        self.n = n
        # One PMF table per distinct size, shared across elements of `n`.
        self._dists = {ni: _get_wilcoxon_distr(ni) for ni in np.unique(n)}

    def _cdf1(self, k, n):
        # Scalar CDF: P(T <= k), a partial sum of the PMF table.
        pmfs = self._dists[n]
        return pmfs[:k + 1].sum()

    def _cdf(self, k, n):
        # Vectorized wrapper around `_cdf1`.
        return np.vectorize(self._cdf1, otypes=[float])(k, n)

    def _sf1(self, k, n):
        # Scalar SF: P(T >= k) -- note this tail *includes* k.
        pmfs = self._dists[n]
        return pmfs[k:].sum()

    def _sf(self, k, n):
        # Vectorized wrapper around `_sf1`.
        return np.vectorize(self._sf1, otypes=[float])(k, n)

    def mean(self):
        # Mean of the null distribution: n(n + 1)/4.
        return self.n * (self.n + 1) / 4

    def _prep(self, k):
        # Common setup for `cdf`/`sf`.
        # NOTE(review): `out` is returned but currently unused by callers.
        k = np.asarray(k).astype(int, copy=False)
        mn = self.mean()
        out = np.empty(k.shape, dtype=np.float64)
        return k, mn, out

    def cdf(self, k):
        # Sum whichever tail is shorter: direct CDF at or below the mean,
        # otherwise 1 - SF of the complementary (upper) tail.
        k, mn, out = self._prep(k)
        return _lazywhere(k <= mn, (k, self.n), self._cdf,
                          f2=lambda k, n: 1 - self._sf(k+1, n))[()]

    def sf(self, k):
        # Mirror of `cdf`: direct SF at or below the mean, otherwise
        # 1 - CDF of the complementary (lower) tail.
        k, mn, out = self._prep(k)
        return _lazywhere(k <= mn, (k, self.n), self._sf,
                          f2=lambda k, n: 1 - self._cdf(k-1, n))[()]
|
||||
|
||||
|
||||
def _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis):
    """Validate and standardize inputs for the Wilcoxon signed-rank test.

    Returns the differences `d` (sample moved to the last axis), the
    normalized option values, and `output_z` (whether the result should
    carry a z-statistic, i.e. whether `method` was 'approx').
    """

    # `axis` must be a scalar integer (0-d after unwrapping).
    axis = np.asarray(axis)[()]
    message = "`axis` must be an integer."
    if not np.issubdtype(axis.dtype, np.integer) or axis.ndim != 0:
        raise ValueError(message)

    message = '`axis` must be compatible with the shape(s) of `x` (and `y`)'
    try:
        if y is None:
            # One-sample form: `x` already holds the differences.
            x = np.asarray(x)
            d = x
        else:
            x, y = _broadcast_arrays((x, y), axis=axis)
            d = x - y
        # Work along the last axis from here on.
        d = np.moveaxis(d, axis, -1)
    # NOTE(review): `np.AxisError` was removed from the top-level namespace
    # in NumPy 2.0 (use `np.exceptions.AxisError`) -- confirm the supported
    # NumPy range for this file.
    except np.AxisError as e:
        raise ValueError(message) from e

    message = "`x` and `y` must have the same length along `axis`."
    if y is not None and x.shape[axis] != y.shape[axis]:
        raise ValueError(message)

    # Promote integer differences to float; reject non-real dtypes.
    message = "`x` (and `y`, if provided) must be an array of real numbers."
    if np.issubdtype(d.dtype, np.integer):
        d = d.astype(np.float64)
    if not np.issubdtype(d.dtype, np.floating):
        raise ValueError(message)

    zero_method = str(zero_method).lower()
    zero_methods = {"wilcox", "pratt", "zsplit"}
    message = f"`zero_method` must be one of {zero_methods}."
    if zero_method not in zero_methods:
        raise ValueError(message)

    corrections = {True, False}
    message = f"`correction` must be one of {corrections}."
    if correction not in corrections:
        raise ValueError(message)

    alternative = str(alternative).lower()
    alternatives = {"two-sided", "less", "greater"}
    message = f"`alternative` must be one of {alternatives}."
    if alternative not in alternatives:
        raise ValueError(message)

    if not isinstance(method, stats.PermutationMethod):
        methods = {"auto", "approx", "exact"}
        message = (f"`method` must be one of {methods} or "
                   "an instance of `stats.PermutationMethod`.")
        if method not in methods:
            raise ValueError(message)
    # Only the normal approximation produces a z-statistic to report.
    output_z = True if method == 'approx' else False

    # logic unchanged here for backward compatibility
    n_zero = np.sum(d == 0, axis=-1)
    has_zeros = np.any(n_zero > 0)
    if method == "auto":
        # Small, zero-free samples get the exact distribution.
        if d.shape[-1] <= 50 and not has_zeros:
            method = "exact"
        else:
            method = "approx"

    # Total zero count across the whole array (not per-slice).
    n_zero = np.sum(d == 0)
    if n_zero > 0 and method == "exact":
        method = "approx"
        warnings.warn("Exact p-value calculation does not work if there are "
                      "zeros. Switching to normal approximation.",
                      stacklevel=2)

    if (method == "approx" and zero_method in ["wilcox", "pratt"]
            and n_zero == d.size and d.size > 0 and d.ndim == 1):
        raise ValueError("zero_method 'wilcox' and 'pratt' do not "
                         "work if x - y is zero for all elements.")

    if 0 < d.shape[-1] < 10 and method == "approx":
        warnings.warn("Sample size too small for normal approximation.", stacklevel=2)

    return d, zero_method, correction, alternative, method, axis, output_z
|
||||
|
||||
|
||||
def _wilcoxon_statistic(d, zero_method='wilcox'):
    """Compute signed-rank sums and the normal-approximation z statistic.

    Parameters
    ----------
    d : ndarray
        Differences, with the sample along the last axis.
    zero_method : {'wilcox', 'pratt', 'zsplit'}
        Treatment of zero differences.

    Returns
    -------
    r_plus, r_minus : ndarray
        Rank sums of the positive and negative differences.
    se : ndarray
        Standard error of `r_plus` under the null (tie-corrected).
    z : ndarray
        Normal-approximation statistic ``(r_plus - mean) / se``.
    count : ndarray
        Number of differences used per slice (NaNs excluded; zeros also
        excluded under 'wilcox').
    """

    i_zeros = (d == 0)

    if zero_method == 'wilcox':
        # Wilcoxon's method for treating zeros was to remove them from
        # the calculation. We do this by replacing 0s with NaNs, which
        # are ignored anyway.
        if not d.flags['WRITEABLE']:
            # Don't mutate a read-only (e.g. broadcast) input.
            d = d.copy()
        d[i_zeros] = np.nan

    i_nan = np.isnan(d)
    n_nan = np.sum(i_nan, axis=-1)
    count = d.shape[-1] - n_nan

    # Rank the absolute differences; `t` holds tie counts for the
    # tie correction applied to `se` below.
    r, t = _rankdata(abs(d), 'average', return_ties=True)

    r_plus = np.sum((d > 0) * r, axis=-1)
    r_minus = np.sum((d < 0) * r, axis=-1)

    if zero_method == "zsplit":
        # The "zero-split" method for treating zeros is to add half their contribution
        # to r_plus and half to r_minus.
        # See gh-2263 for the origin of this method.
        r_zero_2 = np.sum(i_zeros * r, axis=-1) / 2
        r_plus += r_zero_2
        r_minus += r_zero_2

    # Null mean of r_plus; `se` is accumulated unscaled and finished below.
    mn = count * (count + 1.) * 0.25
    se = count * (count + 1.) * (2. * count + 1.)

    if zero_method == "pratt":
        # Pratt's method for treating zeros was just to modify the z-statistic.

        # normal approximation needs to be adjusted, see Cureton (1967)
        n_zero = i_zeros.sum(axis=-1)
        mn -= n_zero * (n_zero + 1.) * 0.25
        se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)

        # zeros are not to be included in tie-correction.
        # any tie counts corresponding with zeros are in the 0th column
        t[i_zeros.any(axis=-1), 0] = 0

    tie_correct = (t**3 - t).sum(axis=-1)
    se -= tie_correct/2
    se = np.sqrt(se / 24)

    z = (r_plus - mn) / se

    return r_plus, r_minus, se, z, count
|
||||
|
||||
|
||||
def _correction_sign(z, alternative):
|
||||
if alternative == 'greater':
|
||||
return 1
|
||||
elif alternative == 'less':
|
||||
return -1
|
||||
else:
|
||||
return np.sign(z)
|
||||
|
||||
|
||||
def _wilcoxon_nd(x, y=None, zero_method='wilcox', correction=True,
                 alternative='two-sided', method='auto', axis=0):
    """N-dimensional implementation of the Wilcoxon signed-rank test.

    Validates inputs via `_wilcoxon_iv`, computes the statistic along the
    last axis, and evaluates the p-value with the selected method
    ('approx', 'exact', or a `stats.PermutationMethod` instance).
    """

    temp = _wilcoxon_iv(x, y, zero_method, correction, alternative, method, axis)
    d, zero_method, correction, alternative, method, axis, output_z = temp

    if d.size == 0:
        # Empty input: all-NaN result (plus NaN z-statistic for 'approx').
        NaN = _get_nan(d)
        res = _morestats.WilcoxonResult(statistic=NaN, pvalue=NaN)
        if method == 'approx':
            res.zstatistic = NaN
        return res

    r_plus, r_minus, se, z, count = _wilcoxon_statistic(d, zero_method)

    if method == 'approx':
        if correction:
            # Continuity correction: shift z by 0.5/se toward the null.
            sign = _correction_sign(z, alternative)
            z -= sign * 0.5 / se
        p = _get_pvalue(z, _SimpleNormal(), alternative, xp=np)
    elif method == 'exact':
        dist = WilcoxonDistribution(count)
        # The null distribution in `dist` is exact only if there are no ties
        # or zeros. If there are ties or zeros, the statistic can be non-
        # integral, but the null distribution is only defined for integral
        # values of the statistic. Therefore, we're conservative: round
        # non-integral statistic up before computing CDF and down before
        # computing SF. This preserves symmetry w.r.t. alternatives and
        # order of the input arguments. See gh-19872.
        if alternative == 'less':
            p = dist.cdf(np.ceil(r_plus))
        elif alternative == 'greater':
            p = dist.sf(np.floor(r_plus))
        else:
            # Two-sided: double the smaller tail, capped at 1.
            p = 2 * np.minimum(dist.sf(np.floor(r_plus)),
                               dist.cdf(np.ceil(r_plus)))
            p = np.clip(p, 0, 1)
    else:  # `PermutationMethod` instance (already validated)
        p = stats.permutation_test(
            (d,), lambda d: _wilcoxon_statistic(d, zero_method)[0],
            permutation_type='samples', **method._asdict(),
            alternative=alternative, axis=-1).pvalue

    # for backward compatibility...
    statistic = np.minimum(r_plus, r_minus) if alternative=='two-sided' else r_plus
    z = -np.abs(z) if (alternative == 'two-sided' and method == 'approx') else z

    res = _morestats.WilcoxonResult(statistic=statistic, pvalue=p[()])
    if output_z:
        res.zstatistic = z[()]
    return res
|
||||
16
venv/lib/python3.12/site-packages/scipy/stats/biasedurn.py
Normal file
16
venv/lib/python3.12/site-packages/scipy/stats/biasedurn.py
Normal file
@ -0,0 +1,16 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
|
||||
# Deprecated shim module: nothing is re-exported publicly.
__all__: list[str] = []


def __dir__():
    # Advertise only the (empty) public API to dir() and tab completion.
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    # Delegate attribute access to the deprecation helper, which warns and
    # forwards lookups to the private `_biasedurn` module.
    return _sub_module_deprecation(sub_package="stats", module="biasedurn",
                                   private_modules=["_biasedurn"], all=__all__,
                                   attribute=name)
|
||||
468
venv/lib/python3.12/site-packages/scipy/stats/contingency.py
Normal file
468
venv/lib/python3.12/site-packages/scipy/stats/contingency.py
Normal file
@ -0,0 +1,468 @@
|
||||
"""
|
||||
Contingency table functions (:mod:`scipy.stats.contingency`)
|
||||
============================================================
|
||||
|
||||
Functions for creating and analyzing contingency tables.
|
||||
|
||||
.. currentmodule:: scipy.stats.contingency
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
chi2_contingency
|
||||
relative_risk
|
||||
odds_ratio
|
||||
crosstab
|
||||
association
|
||||
|
||||
expected_freq
|
||||
margins
|
||||
|
||||
"""
|
||||
|
||||
|
||||
from functools import reduce
|
||||
import math
|
||||
import numpy as np
|
||||
from ._stats_py import power_divergence
|
||||
from ._relative_risk import relative_risk
|
||||
from ._crosstab import crosstab
|
||||
from ._odds_ratio import odds_ratio
|
||||
from scipy._lib._bunch import _make_tuple_bunch
|
||||
|
||||
|
||||
__all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
|
||||
'association', 'relative_risk', 'odds_ratio']
|
||||
|
||||
|
||||
def margins(a):
    """Return a list of the marginal sums of the array `a`.

    Parameters
    ----------
    a : ndarray
        The array for which to compute the marginal sums.

    Returns
    -------
    margsums : list of ndarrays
        A list of length `a.ndim`.  `margsums[k]` is the result of
        summing `a` over all axes except `k`; it has the same number of
        dimensions as `a`, but every axis other than axis `k` has
        length 1.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import margins

    >>> a = np.arange(12).reshape(2, 6)
    >>> a
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11]])
    >>> m0, m1 = margins(a)
    >>> m0
    array([[15],
           [51]])
    >>> m1
    array([[ 6,  8, 10, 12, 14, 16]])

    >>> b = np.arange(24).reshape(2,3,4)
    >>> m0, m1, m2 = margins(b)
    >>> m0
    array([[[ 66]],
           [[210]]])
    >>> m1
    array([[[ 60],
            [ 92],
            [124]]])
    >>> m2
    array([[[60, 66, 72, 78]]])
    """
    axes = range(a.ndim)
    # apply_over_axes keeps each reduced axis as a length-1 dimension, so
    # every marginal broadcasts against `a` (and against the others).
    return [np.apply_over_axes(np.sum, a, [j for j in axes if j != k])
            for k in axes]
|
||||
|
||||
|
||||
def expected_freq(observed):
    """
    Compute the expected frequencies from a contingency table.

    Given an n-dimensional contingency table of observed frequencies,
    compute the expected frequencies for the table based on the marginal
    sums under the assumption that the groups associated with each
    dimension are independent.

    Parameters
    ----------
    observed : array_like
        The table of observed frequencies.  (While this function can handle
        a 1-D array, that case is trivial.  Generally `observed` is at
        least 2-D.)

    Returns
    -------
    expected : ndarray of float64
        The expected frequencies, based on the marginal sums of the table.
        Same shape as `observed`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.contingency import expected_freq
    >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
    >>> expected_freq(observed)
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    """
    # Convert to float64 up front: `observed` is typically an integer
    # array, and the products of marginal sums below could overflow an
    # integer dtype for high-dimensional tables or large counts.
    table = np.asarray(observed, dtype=np.float64)

    # Multiply the marginal sums together.  Each marginal keeps singleton
    # axes everywhere except its own axis (see `margins`), so plain
    # broadcasting expands the product to the full table shape.
    product = None
    for marginal in margins(table):
        product = marginal if product is None else product * marginal

    # Every marginal contains one factor of the grand total, so divide by
    # total**(ndim - 1) to leave a single factor of it in the result.
    return product / table.sum() ** (table.ndim - 1)
|
||||
|
||||
|
||||
# Result type for `chi2_contingency`.  A namedtuple-like "bunch" with four
# positional fields (and no extra keyword-only fields), so it unpacks like
# the historical (chi2, p, dof, expected) tuple while also exposing the
# values as attributes.
Chi2ContingencyResult = _make_tuple_bunch(
    'Chi2ContingencyResult',
    ['statistic', 'pvalue', 'dof', 'expected_freq'], []
)
|
||||
|
||||
|
||||
def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables in a contingency table.

    This function computes the chi-square statistic and p-value for the
    hypothesis test of independence of the observed frequencies in the
    contingency table [1]_ `observed`.  The expected frequencies are
    computed based on the marginal sums under the assumption of
    independence; see `scipy.stats.contingency.expected_freq`.  The number
    of degrees of freedom is (expressed using numpy functions and
    attributes)::

        dof = observed.size - sum(observed.shape) + observed.ndim - 1

    Parameters
    ----------
    observed : array_like
        The contingency table.  The table contains the observed
        frequencies (i.e. number of occurrences) in each category.  In
        the two-dimensional case, the table is often described as an
        "R x C table".
    correction : bool, optional
        If True, *and* the degrees of freedom is 1, apply Yates'
        correction for continuity.  The effect of the correction is to
        adjust each observed value by 0.5 towards the corresponding
        expected value.
    lambda_ : float or str, optional
        By default, the statistic computed in this test is Pearson's
        chi-squared statistic [2]_.  `lambda_` allows a statistic from
        the Cressie-Read power divergence family [3]_ to be used
        instead.  See `scipy.stats.power_divergence` for details.

    Returns
    -------
    res : Chi2ContingencyResult
        An object containing attributes:

        statistic : float
            The test statistic.
        pvalue : float
            The p-value of the test.
        dof : int
            The degrees of freedom.
        expected_freq : ndarray, same shape as `observed`
            The expected frequencies, based on the marginal sums of the
            table.

    Raises
    ------
    ValueError
        If `observed` contains a negative entry, has size 0, or yields a
        table of expected frequencies with a zero element.

    See Also
    --------
    scipy.stats.contingency.expected_freq
    scipy.stats.fisher_exact
    scipy.stats.chisquare
    scipy.stats.power_divergence
    scipy.stats.barnard_exact
    scipy.stats.boschloo_exact

    Notes
    -----
    An often quoted guideline for the validity of this calculation is
    that the test should be used only if the observed and expected
    frequencies in each cell are at least 5.

    The test is only meaningful when `observed` has two or more
    nontrivial dimensions; for a one-dimensional table ``expected``
    equals ``observed`` and the statistic is identically 0.

    This function does not handle masked arrays, because the calculation
    does not make sense with missing values.

    The `lambda_` argument was added in version 0.13.0 of scipy.

    References
    ----------
    .. [1] "Contingency table",
           https://en.wikipedia.org/wiki/Contingency_table
    .. [2] "Pearson's chi-squared test",
           https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
    .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
           Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
           pp. 440-464.
    .. [4] Berger, Jeffrey S. et al. "Aspirin for the Primary Prevention
           of Cardiovascular Events in Women and Men: A Sex-Specific
           Meta-analysis of Randomized Controlled Trials."
           JAMA, 295(3):306-313, :doi:`10.1001/jama.295.3.306`, 2006.

    Examples
    --------
    In [4]_, the use of aspirin to prevent cardiovascular events in women
    and men was investigated.  Focusing on ischemic stroke in women, the
    following table summarizes an experiment in which participants took
    aspirin or a placebo on a regular basis for several years::

                          Aspirin   Control/Placebo
        Ischemic stroke     176           230
        No stroke         21035         21018

    Is there evidence that aspirin reduces the risk of ischemic stroke?
    Formulate the null hypothesis "the effect of aspirin is equivalent to
    that of placebo" and assess it with a chi-square test:

    >>> import numpy as np
    >>> from scipy.stats import chi2_contingency
    >>> table = np.array([[176, 230], [21035, 21018]])
    >>> res = chi2_contingency(table)
    >>> res.statistic
    6.892569132546561
    >>> res.pvalue
    0.008655478161175739

    At the 5% significance level we reject the null hypothesis.  Because
    the test is two-sided, the alternative does not indicate the
    direction of the effect; `scipy.stats.contingency.odds_ratio` can be
    used to support the conclusion that aspirin *reduces* the risk.

    A two-way example (2 x 3):

    >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
    >>> res = chi2_contingency(obs)
    >>> res.statistic
    2.7777777777777777
    >>> res.pvalue
    0.24935220877729619
    >>> res.dof
    2
    >>> res.expected_freq
    array([[ 12.,  12.,  16.],
           [ 18.,  18.,  24.]])

    Perform the test using the log-likelihood ratio (i.e. the "G-test")
    instead of Pearson's chi-squared statistic:

    >>> res = chi2_contingency(obs, lambda_="log-likelihood")
    >>> res.statistic
    2.7688587616781319
    >>> res.pvalue
    0.25046668010954165
    """
    table = np.asarray(observed)
    if np.any(table < 0):
        raise ValueError("All values in `observed` must be nonnegative.")
    if table.size == 0:
        raise ValueError("No data; `observed` has size 0.")

    expected = expected_freq(table)
    zero_mask = expected == 0
    if zero_mask.any():
        # Report one offending position so the user can locate the empty
        # marginal in the input table.
        zeropos = next(zip(*np.nonzero(zero_mask)))
        raise ValueError("The internally computed table of expected "
                         f"frequencies has a zero element at {zeropos}.")

    # Degrees of freedom for an n-way table under independence.
    dof = expected.size - sum(expected.shape) + expected.ndim - 1

    if dof == 0:
        # Degenerate case: at most one nontrivial dimension, so
        # observed == expected and the statistic is identically zero.
        return Chi2ContingencyResult(0.0, 1.0, dof, expected)

    if dof == 1 and correction:
        # Yates' continuity correction: move each observed count 0.5
        # toward its expected value, but never past it; see gh-13875.
        shift = expected - table
        table = table + np.sign(shift) * np.minimum(0.5, np.abs(shift))

    stat, pvalue = power_divergence(table, expected,
                                    ddof=table.size - 1 - dof,
                                    axis=None, lambda_=lambda_)
    return Chi2ContingencyResult(stat, pvalue, dof, expected)
|
||||
|
||||
|
||||
def association(observed, method="cramer", correction=False, lambda_=None):
    """Calculates degree of association between two nominal variables.

    The function provides the option for computing one of three measures
    of association between two nominal variables from the data given in a
    2d contingency table: Tschuprow's T, Pearson's Contingency Coefficient
    and Cramer's V.

    Parameters
    ----------
    observed : array-like
        The array of observed values
    method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
        The association test statistic.
    correction : bool, optional
        Inherited from `scipy.stats.contingency.chi2_contingency()`
    lambda_ : float or str, optional
        Inherited from `scipy.stats.contingency.chi2_contingency()`

    Returns
    -------
    statistic : float
        Value of the test statistic

    Raises
    ------
    ValueError
        If `observed` is not an integer array, is not 2-dimensional, or
        if `method` is not one of the supported names.

    Notes
    -----
    Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient all
    measure the degree to which two nominal or ordinal variables are
    related, or the level of their association.  This differs from
    correlation: correlation measures in what way two variables are
    related, whereas association measures how related they are.  A value
    of 1.0 indicates perfect association, and 0.0 means no association.

    Cramer's V and Tschuprow's T are both extensions of the phi
    coefficient and often produce similar values; they are likely to
    diverge more as the table shape departs from 2x2.

    References
    ----------
    .. [1] "Tschuprow's T",
           https://en.wikipedia.org/wiki/Tschuprow's_T
    .. [2] Tschuprow, A. A. (1939)
           Principles of the Mathematical Theory of Correlation;
           translated by M. Kantorowitsch. W. Hodge & Co.
    .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
    .. [4] "Nominal Association: Phi and Cramer's V",
           http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
    .. [5] Gingrich, Paul, "Association Between Variables",
           http://uregina.ca/~gingrich/ch11a.pdf

    Examples
    --------
    An example with a 4x2 contingency table:

    >>> import numpy as np
    >>> from scipy.stats.contingency import association
    >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])

    Pearson's contingency coefficient

    >>> association(obs4x2, method="pearson")
    0.18303298140595667

    Cramer's V

    >>> association(obs4x2, method="cramer")
    0.18617813077483678

    Tschuprow's T

    >>> association(obs4x2, method="tschuprow")
    0.14146478765062995
    """
    table = np.asarray(observed)
    if not np.issubdtype(table.dtype, np.integer):
        raise ValueError("`observed` must be an integer array.")
    if table.ndim != 2:
        raise ValueError("method only accepts 2d arrays")

    res = chi2_contingency(table, correction=correction, lambda_=lambda_)

    # Phi-squared: the chi-squared statistic normalized by sample size.
    phi2 = res.statistic / table.sum()
    n_rows, n_cols = table.shape

    if method == "cramer":
        normed = phi2 / min(n_cols - 1, n_rows - 1)
    elif method == "tschuprow":
        normed = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
    elif method == 'pearson':
        normed = phi2 / (1 + phi2)
    else:
        raise ValueError("Invalid argument value: 'method' argument must "
                         "be 'cramer', 'tschuprow', or 'pearson'")

    return math.sqrt(normed)
|
||||
@ -0,0 +1,24 @@
|
||||
#
|
||||
# Author: Travis Oliphant 2002-2011 with contributions from
|
||||
# SciPy Developers 2004-2011
|
||||
#
|
||||
# NOTE: To look at history using `git blame`, use `git blame -M -C -C`
|
||||
# instead of `git blame -Lxxx,+x`.
|
||||
#
|
||||
from ._distn_infrastructure import (rv_discrete, rv_continuous, rv_frozen) # noqa: F401
|
||||
|
||||
from . import _continuous_distns
|
||||
from . import _discrete_distns
|
||||
|
||||
from ._continuous_distns import * # noqa: F403
|
||||
from ._levy_stable import levy_stable
|
||||
from ._discrete_distns import * # noqa: F403
|
||||
from ._entropy import entropy
|
||||
|
||||
# For backwards compatibility e.g. pymc expects distributions.__all__.
|
||||
__all__ = ['rv_discrete', 'rv_continuous', 'rv_histogram', 'entropy'] # noqa: F405
|
||||
|
||||
# Add only the distribution names, not the *_gen names.
|
||||
__all__ += _continuous_distns._distn_names
|
||||
__all__ += ['levy_stable']
|
||||
__all__ += _discrete_distns._distn_names
|
||||
18
venv/lib/python3.12/site-packages/scipy/stats/kde.py
Normal file
18
venv/lib/python3.12/site-packages/scipy/stats/kde.py
Normal file
@ -0,0 +1,18 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
# Use the `scipy.stats` namespace for importing the functions
|
||||
# included below.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
|
||||
__all__ = ["gaussian_kde"] # noqa: F822
|
||||
|
||||
|
||||
def __dir__():
    # Restrict dir() on this deprecated shim module to its public names.
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    # Forward attribute access to the private `_kde` implementation module
    # while emitting a deprecation warning (shim removed in SciPy 2.0.0).
    return _sub_module_deprecation(sub_package="stats", module="kde",
                                   private_modules=["_kde"], all=__all__,
                                   attribute=name)
|
||||
27
venv/lib/python3.12/site-packages/scipy/stats/morestats.py
Normal file
27
venv/lib/python3.12/site-packages/scipy/stats/morestats.py
Normal file
@ -0,0 +1,27 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
# Use the `scipy.stats` namespace for importing the functions
|
||||
# included below.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
|
||||
__all__ = [ # noqa: F822
|
||||
'mvsdist',
|
||||
'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
|
||||
'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
|
||||
'shapiro', 'anderson', 'ansari', 'bartlett', 'levene',
|
||||
'fligner', 'mood', 'wilcoxon', 'median_test',
|
||||
'circmean', 'circvar', 'circstd', 'anderson_ksamp',
|
||||
'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
|
||||
'yeojohnson_normplot', 'find_repeats', 'chi2_contingency', 'distributions',
|
||||
]
|
||||
|
||||
|
||||
def __dir__():
    # Restrict dir() on this deprecated shim module to its public names.
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    # Forward attribute access to the private `_morestats` implementation
    # module with a deprecation warning (shim removed in SciPy 2.0.0).
    return _sub_module_deprecation(sub_package="stats", module="morestats",
                                   private_modules=["_morestats"], all=__all__,
                                   attribute=name)
|
||||
140
venv/lib/python3.12/site-packages/scipy/stats/mstats.py
Normal file
140
venv/lib/python3.12/site-packages/scipy/stats/mstats.py
Normal file
@ -0,0 +1,140 @@
|
||||
"""
|
||||
===================================================================
|
||||
Statistical functions for masked arrays (:mod:`scipy.stats.mstats`)
|
||||
===================================================================
|
||||
|
||||
.. currentmodule:: scipy.stats.mstats
|
||||
|
||||
This module contains a large number of statistical functions that can
|
||||
be used with masked arrays.
|
||||
|
||||
Most of these functions are similar to those in `scipy.stats` but might
|
||||
have small differences in the API or in the algorithm used. Since this
|
||||
is a relatively new package, some API changes are still possible.
|
||||
|
||||
Summary statistics
|
||||
==================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
describe
|
||||
gmean
|
||||
hmean
|
||||
kurtosis
|
||||
mode
|
||||
mquantiles
|
||||
hdmedian
|
||||
hdquantiles
|
||||
hdquantiles_sd
|
||||
idealfourths
|
||||
plotting_positions
|
||||
meppf
|
||||
moment
|
||||
skew
|
||||
tmean
|
||||
tvar
|
||||
tmin
|
||||
tmax
|
||||
tsem
|
||||
variation
|
||||
find_repeats
|
||||
sem
|
||||
trimmed_mean
|
||||
trimmed_mean_ci
|
||||
trimmed_std
|
||||
trimmed_var
|
||||
|
||||
Frequency statistics
|
||||
====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
scoreatpercentile
|
||||
|
||||
Correlation functions
|
||||
=====================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
f_oneway
|
||||
pearsonr
|
||||
spearmanr
|
||||
pointbiserialr
|
||||
kendalltau
|
||||
kendalltau_seasonal
|
||||
linregress
|
||||
siegelslopes
|
||||
theilslopes
|
||||
sen_seasonal_slopes
|
||||
|
||||
Statistical tests
|
||||
=================
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
ttest_1samp
|
||||
ttest_onesamp
|
||||
ttest_ind
|
||||
ttest_rel
|
||||
chisquare
|
||||
kstest
|
||||
ks_2samp
|
||||
ks_1samp
|
||||
ks_twosamp
|
||||
mannwhitneyu
|
||||
rankdata
|
||||
kruskal
|
||||
kruskalwallis
|
||||
friedmanchisquare
|
||||
brunnermunzel
|
||||
skewtest
|
||||
kurtosistest
|
||||
normaltest
|
||||
|
||||
Transformations
|
||||
===============
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
obrientransform
|
||||
trim
|
||||
trima
|
||||
trimmed_stde
|
||||
trimr
|
||||
trimtail
|
||||
trimboth
|
||||
winsorize
|
||||
zmap
|
||||
zscore
|
||||
|
||||
Other
|
||||
=====
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
argstoarray
|
||||
count_tied_groups
|
||||
msign
|
||||
compare_medians_ms
|
||||
median_cihs
|
||||
mjci
|
||||
mquantiles_cimj
|
||||
rsh
|
||||
|
||||
"""
|
||||
from . import _mstats_basic
|
||||
from . import _mstats_extras
|
||||
from ._mstats_basic import * # noqa: F403
|
||||
from ._mstats_extras import * # noqa: F403
|
||||
# Functions that support masked array input in stats but need to be kept in the
|
||||
# mstats namespace for backwards compatibility:
|
||||
from scipy.stats import gmean, hmean, zmap, zscore, chisquare
|
||||
|
||||
__all__ = _mstats_basic.__all__ + _mstats_extras.__all__
|
||||
__all__ += ['gmean', 'hmean', 'zmap', 'zscore', 'chisquare']
|
||||
@ -0,0 +1,42 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
# Use the `scipy.stats` namespace for importing the functions
|
||||
# included below.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
|
||||
__all__ = [ # noqa: F822
|
||||
'argstoarray',
|
||||
'count_tied_groups',
|
||||
'describe',
|
||||
'f_oneway', 'find_repeats','friedmanchisquare',
|
||||
'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
|
||||
'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
|
||||
'ks_1samp', 'kstest',
|
||||
'linregress',
|
||||
'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
|
||||
'normaltest',
|
||||
'obrientransform',
|
||||
'pearsonr','plotting_positions','pointbiserialr',
|
||||
'rankdata',
|
||||
'scoreatpercentile','sem',
|
||||
'sen_seasonal_slopes','skew','skewtest','spearmanr',
|
||||
'siegelslopes', 'theilslopes',
|
||||
'tmax','tmean','tmin','trim','trimboth',
|
||||
'trimtail','trima','trimr','trimmed_mean','trimmed_std',
|
||||
'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
|
||||
'ttest_ind','ttest_rel','tvar',
|
||||
'variation',
|
||||
'winsorize',
|
||||
'brunnermunzel',
|
||||
]
|
||||
|
||||
|
||||
def __dir__():
    # Restrict dir() on this deprecated shim module to its public names.
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    # Forward attribute access to the private `_mstats_basic` module with a
    # deprecation warning; `correct_module` points users at the supported
    # public namespace, `scipy.stats.mstats` (shim removed in SciPy 2.0.0).
    return _sub_module_deprecation(sub_package="stats", module="mstats_basic",
                                   private_modules=["_mstats_basic"], all=__all__,
                                   attribute=name, correct_module="mstats")
|
||||
@ -0,0 +1,25 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
# Use the `scipy.stats` namespace for importing the functions
|
||||
# included below.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
|
||||
__all__ = [ # noqa: F822
|
||||
'compare_medians_ms',
|
||||
'hdquantiles', 'hdmedian', 'hdquantiles_sd',
|
||||
'idealfourths',
|
||||
'median_cihs','mjci','mquantiles_cimj',
|
||||
'rsh',
|
||||
'trimmed_mean_ci',
|
||||
]
|
||||
|
||||
|
||||
def __dir__():
    # Restrict dir() on this deprecated shim module to its public names.
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    # Forward attribute access to the private `_mstats_extras` module with a
    # deprecation warning; `correct_module` points users at the supported
    # public namespace, `scipy.stats.mstats` (shim removed in SciPy 2.0.0).
    return _sub_module_deprecation(sub_package="stats", module="mstats_extras",
                                   private_modules=["_mstats_extras"], all=__all__,
                                   attribute=name, correct_module="mstats")
|
||||
17
venv/lib/python3.12/site-packages/scipy/stats/mvn.py
Normal file
17
venv/lib/python3.12/site-packages/scipy/stats/mvn.py
Normal file
@ -0,0 +1,17 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
# Use the `scipy.stats` namespace for importing the functions
|
||||
# included below.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
__all__: list[str] = []
|
||||
|
||||
|
||||
def __dir__():
    # Restrict dir() on this deprecated shim module to its public names
    # (empty here: `mvn` exposed no public functions).
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    # Forward attribute access to the private `_mvn` implementation module
    # while emitting a deprecation warning (shim removed in SciPy 2.0.0).
    return _sub_module_deprecation(sub_package="stats", module="mvn",
                                   private_modules=["_mvn"], all=__all__,
                                   attribute=name)
|
||||
236
venv/lib/python3.12/site-packages/scipy/stats/qmc.py
Normal file
236
venv/lib/python3.12/site-packages/scipy/stats/qmc.py
Normal file
@ -0,0 +1,236 @@
|
||||
r"""
|
||||
====================================================
|
||||
Quasi-Monte Carlo submodule (:mod:`scipy.stats.qmc`)
|
||||
====================================================
|
||||
|
||||
.. currentmodule:: scipy.stats.qmc
|
||||
|
||||
This module provides Quasi-Monte Carlo generators and associated helper
|
||||
functions.
|
||||
|
||||
|
||||
Quasi-Monte Carlo
|
||||
=================
|
||||
|
||||
Engines
|
||||
-------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
QMCEngine
|
||||
Sobol
|
||||
Halton
|
||||
LatinHypercube
|
||||
PoissonDisk
|
||||
MultinomialQMC
|
||||
MultivariateNormalQMC
|
||||
|
||||
Helpers
|
||||
-------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
discrepancy
|
||||
geometric_discrepancy
|
||||
update_discrepancy
|
||||
scale
|
||||
|
||||
|
||||
Introduction to Quasi-Monte Carlo
|
||||
=================================
|
||||
|
||||
Quasi-Monte Carlo (QMC) methods [1]_, [2]_, [3]_ provide an
|
||||
:math:`n \times d` array of numbers in :math:`[0,1]`. They can be used in
|
||||
place of :math:`n` points from the :math:`U[0,1]^{d}` distribution. Compared to
|
||||
random points, QMC points are designed to have fewer gaps and clumps. This is
|
||||
quantified by discrepancy measures [4]_. From the Koksma-Hlawka
|
||||
inequality [5]_ we know that low discrepancy reduces a bound on
|
||||
integration error. Averaging a function :math:`f` over :math:`n` QMC points
|
||||
can achieve an integration error close to :math:`O(n^{-1})` for well
|
||||
behaved functions [2]_.
|
||||
|
||||
Most QMC constructions are designed for special values of :math:`n`
|
||||
such as powers of 2 or large primes. Changing the sample
|
||||
size by even one can degrade their performance, even their
|
||||
rate of convergence [6]_. For instance :math:`n=100` points may give less
|
||||
accuracy than :math:`n=64` if the method was designed for :math:`n=2^m`.
|
||||
|
||||
Some QMC constructions are extensible in :math:`n`: we can find
|
||||
another special sample size :math:`n' > n` and often an infinite
|
||||
sequence of increasing special sample sizes. Some QMC
|
||||
constructions are extensible in :math:`d`: we can increase the dimension,
|
||||
possibly to some upper bound, and typically without requiring
|
||||
special values of :math:`d`. Some QMC methods are extensible in
|
||||
both :math:`n` and :math:`d`.
|
||||
|
||||
QMC points are deterministic. That makes it hard to estimate the accuracy of
|
||||
integrals estimated by averages over QMC points. Randomized QMC (RQMC) [7]_
|
||||
points are constructed so that each point is individually :math:`U[0,1]^{d}`
|
||||
while collectively the :math:`n` points retain their low discrepancy.
|
||||
One can make :math:`R` independent replications of RQMC points to
|
||||
see how stable a computation is. From :math:`R` independent values,
|
||||
a t-test (or bootstrap t-test [8]_) then gives approximate confidence
|
||||
intervals on the mean value. Some RQMC methods produce a
|
||||
root mean squared error that is actually :math:`o(1/n)` and smaller than
|
||||
the rate seen in unrandomized QMC. An intuitive explanation is
|
||||
that the error is a sum of many small ones and random errors
|
||||
cancel in a way that deterministic ones do not. RQMC also
|
||||
has advantages on integrands that are singular or, for other
|
||||
reasons, fail to be Riemann integrable.
|
||||
|
||||
(R)QMC cannot beat Bahkvalov's curse of dimension (see [9]_). For
|
||||
any random or deterministic method, there are worst case functions
|
||||
that will give it poor performance in high dimensions. A worst
|
||||
case function for QMC might be 0 at all n points but very
|
||||
large elsewhere. Worst case analyses get very pessimistic
|
||||
in high dimensions. (R)QMC can bring a great improvement over
|
||||
MC when the functions on which it is used are not worst case.
|
||||
For instance (R)QMC can be especially effective on integrands
|
||||
that are well approximated by sums of functions of
|
||||
some small number of their input variables at a time [10]_, [11]_.
|
||||
That property is often a surprising finding about those functions.
|
||||
|
||||
Also, to see an improvement over IID MC, (R)QMC requires a bit of smoothness of
|
||||
the integrand, roughly the mixed first order derivative in each direction,
|
||||
:math:`\partial^d f/\partial x_1 \cdots \partial x_d`, must be integral.
|
||||
For instance, a function that is 1 inside the hypersphere and 0 outside of it
|
||||
has infinite variation in the sense of Hardy and Krause for any dimension
|
||||
:math:`d = 2`.
|
||||
|
||||
Scrambled nets are a kind of RQMC that have some valuable robustness
|
||||
properties [12]_. If the integrand is square integrable, they give variance
|
||||
:math:`var_{SNET} = o(1/n)`. There is a finite upper bound on
|
||||
:math:`var_{SNET} / var_{MC}` that holds simultaneously for every square
|
||||
integrable integrand. Scrambled nets satisfy a strong law of large numbers
|
||||
for :math:`f` in :math:`L^p` when :math:`p>1`. In some
|
||||
special cases there is a central limit theorem [13]_. For smooth enough
|
||||
integrands they can achieve RMSE nearly :math:`O(n^{-3})`. See [12]_
|
||||
for references about these properties.
|
||||
|
||||
The main kinds of QMC methods are lattice rules [14]_ and digital
|
||||
nets and sequences [2]_, [15]_. The theories meet up in polynomial
|
||||
lattice rules [16]_ which can produce digital nets. Lattice rules
|
||||
require some form of search for good constructions. For digital
|
||||
nets there are widely used default constructions.
|
||||
|
||||
The most widely used QMC methods are Sobol' sequences [17]_.
|
||||
These are digital nets. They are extensible in both :math:`n` and :math:`d`.
|
||||
They can be scrambled. The special sample sizes are powers
|
||||
of 2. Another popular method are Halton sequences [18]_.
|
||||
The constructions resemble those of digital nets. The earlier
|
||||
dimensions have much better equidistribution properties than
|
||||
later ones. There are essentially no special sample sizes.
|
||||
They are not thought to be as accurate as Sobol' sequences.
|
||||
They can be scrambled. The nets of Faure [19]_ are also widely
|
||||
used. All dimensions are equally good, but the special sample
|
||||
sizes grow rapidly with dimension :math:`d`. They can be scrambled.
|
||||
The nets of Niederreiter and Xing [20]_ have the best asymptotic
|
||||
properties but have not shown good empirical performance [21]_.
|
||||
|
||||
Higher order digital nets are formed by a digit interleaving process
|
||||
in the digits of the constructed points. They can achieve higher
|
||||
levels of asymptotic accuracy given higher smoothness conditions on :math:`f`
|
||||
and they can be scrambled [22]_. There is little or no empirical work
|
||||
showing the improved rate to be attained.
|
||||
|
||||
Using QMC is like using the entire period of a small random
|
||||
number generator. The constructions are similar and so
|
||||
therefore are the computational costs [23]_.
|
||||
|
||||
(R)QMC is sometimes improved by passing the points through
|
||||
a baker's transformation (tent function) prior to using them.
|
||||
That function has the form :math:`1-2|x-1/2|`. As :math:`x` goes from 0 to
|
||||
1, this function goes from 0 to 1 and then back. It is very
|
||||
useful to produce a periodic function for lattice rules [14]_,
|
||||
and sometimes it improves the convergence rate [24]_.
|
||||
|
||||
It is not straightforward to apply QMC methods to Markov
|
||||
chain Monte Carlo (MCMC). We can think of MCMC as using
|
||||
:math:`n=1` point in :math:`[0,1]^{d}` for very large :math:`d`, with
|
||||
ergodic results corresponding to :math:`d \to \infty`. One proposal is
|
||||
in [25]_ and under strong conditions an improved rate of convergence
|
||||
has been shown [26]_.
|
||||
|
||||
Returning to Sobol' points: there are many versions depending
|
||||
on what are called direction numbers. Those are the result of
|
||||
searches and are tabulated. A very widely used set of direction
|
||||
numbers come from [27]_. It is extensible in dimension up to
|
||||
:math:`d=21201`.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Owen, Art B. "Monte Carlo Book: the Quasi-Monte Carlo parts." 2019.
|
||||
.. [2] Niederreiter, Harald. "Random number generation and quasi-Monte Carlo
|
||||
methods." Society for Industrial and Applied Mathematics, 1992.
|
||||
.. [3] Dick, Josef, Frances Y. Kuo, and Ian H. Sloan. "High-dimensional
|
||||
integration: the quasi-Monte Carlo way." Acta Numerica no. 22: 133, 2013.
|
||||
.. [4] Aho, A. V., C. Aistleitner, T. Anderson, K. Appel, V. Arnol'd, N.
|
||||
Aronszajn, D. Asotsky et al. "W. Chen et al.(eds.), "A Panorama of
|
||||
Discrepancy Theory", Springer International Publishing,
|
||||
Switzerland: 679, 2014.
|
||||
.. [5] Hickernell, Fred J. "Koksma-Hlawka Inequality." Wiley StatsRef:
|
||||
Statistics Reference Online, 2014.
|
||||
.. [6] Owen, Art B. "On dropping the first Sobol' point." :arxiv:`2008.08051`,
|
||||
2020.
|
||||
.. [7] L'Ecuyer, Pierre, and Christiane Lemieux. "Recent advances in randomized
|
||||
quasi-Monte Carlo methods." In Modeling uncertainty, pp. 419-474. Springer,
|
||||
New York, NY, 2002.
|
||||
.. [8] DiCiccio, Thomas J., and Bradley Efron. "Bootstrap confidence
|
||||
intervals." Statistical science: 189-212, 1996.
|
||||
.. [9] Dimov, Ivan T. "Monte Carlo methods for applied scientists." World
|
||||
Scientific, 2008.
|
||||
.. [10] Caflisch, Russel E., William J. Morokoff, and Art B. Owen. "Valuation
|
||||
of mortgage backed securities using Brownian bridges to reduce effective
|
||||
dimension." Journal of Computational Finance: no. 1 27-46, 1997.
|
||||
.. [11] Sloan, Ian H., and Henryk Wozniakowski. "When are quasi-Monte Carlo
|
||||
algorithms efficient for high dimensional integrals?." Journal of Complexity
|
||||
14, no. 1 (1998): 1-33.
|
||||
.. [12] Owen, Art B., and Daniel Rudolf, "A strong law of large numbers for
|
||||
scrambled net integration." SIAM Review, to appear.
|
||||
.. [13] Loh, Wei-Liem. "On the asymptotic distribution of scrambled net
|
||||
quadrature." The Annals of Statistics 31, no. 4: 1282-1324, 2003.
|
||||
.. [14] Sloan, Ian H. and S. Joe. "Lattice methods for multiple integration."
|
||||
Oxford University Press, 1994.
|
||||
.. [15] Dick, Josef, and Friedrich Pillichshammer. "Digital nets and sequences:
|
||||
discrepancy theory and quasi-Monte Carlo integration." Cambridge University
|
||||
Press, 2010.
|
||||
.. [16] Dick, Josef, F. Kuo, Friedrich Pillichshammer, and I. Sloan.
|
||||
"Construction algorithms for polynomial lattice rules for multivariate
|
||||
integration." Mathematics of computation 74, no. 252: 1895-1921, 2005.
|
||||
.. [17] Sobol', Il'ya Meerovich. "On the distribution of points in a cube and
|
||||
the approximate evaluation of integrals." Zhurnal Vychislitel'noi Matematiki
|
||||
i Matematicheskoi Fiziki 7, no. 4: 784-802, 1967.
|
||||
.. [18] Halton, John H. "On the efficiency of certain quasi-random sequences of
|
||||
points in evaluating multi-dimensional integrals." Numerische Mathematik 2,
|
||||
no. 1: 84-90, 1960.
|
||||
.. [19] Faure, Henri. "Discrepance de suites associees a un systeme de
|
||||
numeration (en dimension s)." Acta arithmetica 41, no. 4: 337-351, 1982.
|
||||
.. [20] Niederreiter, Harald, and Chaoping Xing. "Low-discrepancy sequences and
|
||||
global function fields with many rational places." Finite Fields and their
|
||||
applications 2, no. 3: 241-273, 1996.
|
||||
.. [21] Hong, Hee Sun, and Fred J. Hickernell. "Algorithm 823: Implementing
|
||||
scrambled digital sequences." ACM Transactions on Mathematical Software
|
||||
(TOMS) 29, no. 2: 95-109, 2003.
|
||||
.. [22] Dick, Josef. "Higher order scrambled digital nets achieve the optimal
|
||||
rate of the root mean square error for smooth integrands." The Annals of
|
||||
Statistics 39, no. 3: 1372-1398, 2011.
|
||||
.. [23] Niederreiter, Harald. "Multidimensional numerical integration using
|
||||
pseudorandom numbers." In Stochastic Programming 84 Part I, pp. 17-38.
|
||||
Springer, Berlin, Heidelberg, 1986.
|
||||
.. [24] Hickernell, Fred J. "Obtaining O (N-2+e) Convergence for Lattice
|
||||
Quadrature Rules." In Monte Carlo and Quasi-Monte Carlo Methods 2000,
|
||||
pp. 274-289. Springer, Berlin, Heidelberg, 2002.
|
||||
.. [25] Owen, Art B., and Seth D. Tribble. "A quasi-Monte Carlo Metropolis
|
||||
algorithm." Proceedings of the National Academy of Sciences 102,
|
||||
no. 25: 8844-8849, 2005.
|
||||
.. [26] Chen, Su. "Consistency and convergence rate of Markov chain quasi Monte
|
||||
Carlo with examples." PhD diss., Stanford University, 2011.
|
||||
.. [27] Joe, Stephen, and Frances Y. Kuo. "Constructing Sobol sequences with
|
||||
better two-dimensional projections." SIAM Journal on Scientific Computing
|
||||
30, no. 5: 2635-2654, 2008.
|
||||
|
||||
"""
|
||||
from ._qmc import * # noqa: F403
|
||||
from ._qmc import __all__ # noqa: F401
|
||||
73
venv/lib/python3.12/site-packages/scipy/stats/sampling.py
Normal file
73
venv/lib/python3.12/site-packages/scipy/stats/sampling.py
Normal file
@ -0,0 +1,73 @@
|
||||
"""
|
||||
======================================================
|
||||
Random Number Generators (:mod:`scipy.stats.sampling`)
|
||||
======================================================
|
||||
|
||||
.. currentmodule:: scipy.stats.sampling
|
||||
|
||||
This module contains a collection of random number generators to sample
|
||||
from univariate continuous and discrete distributions. It uses the
|
||||
implementation of a C library called "UNU.RAN". The only exception is
|
||||
RatioUniforms, which is a pure Python implementation of the
|
||||
Ratio-of-Uniforms method.
|
||||
|
||||
Generators Wrapped
|
||||
==================
|
||||
|
||||
For continuous distributions
|
||||
----------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
NumericalInverseHermite
|
||||
NumericalInversePolynomial
|
||||
TransformedDensityRejection
|
||||
SimpleRatioUniforms
|
||||
RatioUniforms
|
||||
|
||||
For discrete distributions
|
||||
--------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
DiscreteAliasUrn
|
||||
DiscreteGuideTable
|
||||
|
||||
Warnings / Errors used in :mod:`scipy.stats.sampling`
|
||||
-----------------------------------------------------
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
UNURANError
|
||||
|
||||
|
||||
Generators for pre-defined distributions
|
||||
========================================
|
||||
|
||||
To easily apply the above methods for some of the continuous distributions
|
||||
in :mod:`scipy.stats`, the following functionality can be used:
|
||||
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
FastGeneratorInversion
|
||||
|
||||
"""
|
||||
from ._sampling import FastGeneratorInversion, RatioUniforms # noqa: F401
|
||||
from ._unuran.unuran_wrapper import ( # noqa: F401
|
||||
TransformedDensityRejection,
|
||||
DiscreteAliasUrn,
|
||||
DiscreteGuideTable,
|
||||
NumericalInversePolynomial,
|
||||
NumericalInverseHermite,
|
||||
SimpleRatioUniforms,
|
||||
UNURANError
|
||||
)
|
||||
|
||||
__all__ = ["NumericalInverseHermite", "NumericalInversePolynomial",
|
||||
"TransformedDensityRejection", "SimpleRatioUniforms",
|
||||
"RatioUniforms", "DiscreteAliasUrn", "DiscreteGuideTable",
|
||||
"UNURANError", "FastGeneratorInversion"]
|
||||
41
venv/lib/python3.12/site-packages/scipy/stats/stats.py
Normal file
41
venv/lib/python3.12/site-packages/scipy/stats/stats.py
Normal file
@ -0,0 +1,41 @@
|
||||
# This file is not meant for public use and will be removed in SciPy v2.0.0.
|
||||
# Use the `scipy.stats` namespace for importing the functions
|
||||
# included below.
|
||||
|
||||
from scipy._lib.deprecation import _sub_module_deprecation
|
||||
|
||||
|
||||
__all__ = [ # noqa: F822
|
||||
'find_repeats', 'gmean', 'hmean', 'pmean', 'mode', 'tmean', 'tvar',
|
||||
'tmin', 'tmax', 'tstd', 'tsem', 'moment',
|
||||
'skew', 'kurtosis', 'describe', 'skewtest', 'kurtosistest',
|
||||
'normaltest', 'jarque_bera',
|
||||
'scoreatpercentile', 'percentileofscore',
|
||||
'cumfreq', 'relfreq', 'obrientransform',
|
||||
'sem', 'zmap', 'zscore', 'gzscore', 'iqr', 'gstd',
|
||||
'median_abs_deviation',
|
||||
'sigmaclip', 'trimboth', 'trim1', 'trim_mean',
|
||||
'f_oneway',
|
||||
'pearsonr', 'fisher_exact',
|
||||
'spearmanr', 'pointbiserialr',
|
||||
'kendalltau', 'weightedtau', 'multiscale_graphcorr',
|
||||
'linregress', 'siegelslopes', 'theilslopes', 'ttest_1samp',
|
||||
'ttest_ind', 'ttest_ind_from_stats', 'ttest_rel',
|
||||
'kstest', 'ks_1samp', 'ks_2samp',
|
||||
'chisquare', 'power_divergence',
|
||||
'tiecorrect', 'ranksums', 'kruskal', 'friedmanchisquare',
|
||||
'rankdata',
|
||||
'combine_pvalues', 'wasserstein_distance', 'energy_distance',
|
||||
'brunnermunzel', 'alexandergovern', 'distributions',
|
||||
'mstats_basic',
|
||||
]
|
||||
|
||||
|
||||
def __dir__():
    # Restrict dir()/tab-completion to the deprecated public names declared
    # in __all__ above.
    return __all__
|
||||
|
||||
|
||||
def __getattr__(name):
    """Resolve ``name`` from the deprecated ``scipy.stats.stats`` namespace.

    Delegates to `_sub_module_deprecation`, which emits a deprecation
    warning and forwards the lookup to the private modules listed below.
    """
    return _sub_module_deprecation(sub_package="stats", module="stats",
                                   private_modules=["_stats_py", "_mgc"], all=__all__,
                                   attribute=name)
|
||||
@ -0,0 +1,354 @@
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
import numpy.testing as npt
|
||||
from numpy.testing import assert_allclose, assert_equal
|
||||
from pytest import raises as assert_raises
|
||||
|
||||
import numpy.ma.testutils as ma_npt
|
||||
|
||||
from scipy._lib._util import (
|
||||
getfullargspec_no_self as _getfullargspec, np_long
|
||||
)
|
||||
from scipy._lib._array_api import xp_assert_equal
|
||||
from scipy import stats
|
||||
|
||||
|
||||
def check_named_results(res, attributes, ma=False, xp=None):
|
||||
for i, attr in enumerate(attributes):
|
||||
if ma:
|
||||
ma_npt.assert_equal(res[i], getattr(res, attr))
|
||||
elif xp is not None:
|
||||
xp_assert_equal(res[i], getattr(res, attr))
|
||||
else:
|
||||
npt.assert_equal(res[i], getattr(res, attr))
|
||||
|
||||
|
||||
def check_normalization(distfn, args, distname):
    """Verify that the distribution's total probability mass is one.

    Checked three ways: the zeroth moment, numerical integration of the
    constant function via ``expect``, and the CDF at the right end of the
    support.
    """
    # moment(0) is the normalization constant by definition.
    npt.assert_allclose(distfn.moment(0, *args), 1.0)

    # The histogram-backed instance is integrated piecewise and needs a
    # looser absolute tolerance.
    if distname == "rv_histogram_instance":
        tolerances = dict(atol=1e-5, rtol=0)
    else:
        tolerances = dict(atol=1e-7, rtol=1e-7)
    total_mass = distfn.expect(lambda x: 1, args=args)
    npt.assert_allclose(total_mass, 1.0, err_msg=distname, verbose=True,
                        **tolerances)

    # CDF evaluated at the upper support bound must be exactly 1.
    _, right_end = distfn.support(*args)
    npt.assert_allclose(distfn.cdf(right_end, *args), 1.0)
|
||||
|
||||
|
||||
def check_moment(distfn, arg, m, v, msg):
|
||||
m1 = distfn.moment(1, *arg)
|
||||
m2 = distfn.moment(2, *arg)
|
||||
if not np.isinf(m):
|
||||
npt.assert_almost_equal(m1, m, decimal=10,
|
||||
err_msg=msg + ' - 1st moment')
|
||||
else: # or np.isnan(m1),
|
||||
npt.assert_(np.isinf(m1),
|
||||
msg + ' - 1st moment -infinite, m1=%s' % str(m1))
|
||||
|
||||
if not np.isinf(v):
|
||||
npt.assert_almost_equal(m2 - m1 * m1, v, decimal=10,
|
||||
err_msg=msg + ' - 2ndt moment')
|
||||
else: # or np.isnan(m2),
|
||||
npt.assert_(np.isinf(m2), msg + f' - 2nd moment -infinite, {m2=}')
|
||||
|
||||
|
||||
def check_mean_expect(distfn, arg, m, msg):
    """Compare E[X] computed by ``expect`` with the expected mean *m*.

    Skipped when *m* is not finite: numerical integration cannot
    reproduce an infinite or undefined mean.
    """
    if not np.isfinite(m):
        return
    mean_via_expect = distfn.expect(lambda x: x, arg)
    npt.assert_almost_equal(mean_via_expect, m, decimal=5,
                            err_msg=msg + ' - 1st moment (expect)')
|
||||
|
||||
|
||||
def check_var_expect(distfn, arg, m, v, msg):
    """Compare E[X**2] from ``expect`` with v + m**2 (finite variance only)."""
    if not np.isfinite(v):
        return
    # A couple of distributions integrate less accurately and need a
    # looser relative tolerance.
    if msg in ("rv_histogram_instance", "ksone"):
        tol = {'rtol': 5e-6}
    else:
        tol = {}
    second_raw_moment = distfn.expect(lambda x: x * x, arg)
    npt.assert_allclose(second_raw_moment, v + m * m, **tol)
|
||||
|
||||
|
||||
def check_skew_expect(distfn, arg, m, v, s, msg):
    """Compare the third central moment from ``expect`` with s * v**1.5."""
    if np.isfinite(s):
        third_central = distfn.expect(lambda x: np.power(x - m, 3), arg)
        npt.assert_almost_equal(third_central, s * np.power(v, 1.5),
                                decimal=5, err_msg=msg + ' - skew')
    else:
        # Non-finite skewness is reported as nan by the distributions.
        npt.assert_(np.isnan(s))
|
||||
|
||||
|
||||
def check_kurt_expect(distfn, arg, m, v, k, msg):
    """Compare the fourth central moment from ``expect`` with (k + 3) * v**2.

    *k* is the excess kurtosis; adding 3 converts it to the raw
    (Pearson) kurtosis used in the comparison.
    """
    if np.isfinite(k):
        fourth_central = distfn.expect(lambda x: np.power(x - m, 4), arg)
        npt.assert_allclose(fourth_central, (k + 3.) * np.power(v, 2),
                            atol=1e-5, rtol=1e-5,
                            err_msg=msg + ' - kurtosis')
    elif not np.isposinf(k):
        # Kurtosis may legitimately be +inf; any other non-finite value
        # must be nan.
        npt.assert_(np.isnan(k))
|
||||
|
||||
|
||||
def check_munp_expect(dist, args, msg):
    """If the distribution overrides ``_munp``, sanity-check a high moment.

    ``moment(5)`` is loosely compared against numerical integration of
    ``x**5`` over the whole real line via ``expect``.
    """
    # If _munp is overridden, test a higher moment. (Before gh-18634, some
    # distributions had issues with moments 5 and higher.)
    if dist._munp.__func__ != stats.rv_continuous._munp:
        res = dist.moment(5, *args)  # shouldn't raise an error
        ref = dist.expect(lambda x: x ** 5, args, lb=-np.inf, ub=np.inf)
        if not np.isfinite(res):  # could be valid; automated test can't know
            return
        # loose tolerance, mostly to see whether _munp returns *something*
        assert_allclose(res, ref, atol=1e-10, rtol=1e-4,
                        err_msg=msg + ' - higher moment / _munp')
|
||||
|
||||
|
||||
def check_entropy(distfn, arg, msg):
    """The entropy must be a number (not nan) for the given parameters."""
    entropy_value = distfn.entropy(*arg)
    npt.assert_(not np.isnan(entropy_value), msg + 'test Entropy is nan')
|
||||
|
||||
|
||||
def check_private_entropy(distfn, args, superclass):
    """The distribution-specific ``_entropy`` must agree with the generic one."""
    specific = distfn._entropy(*args)
    generic = superclass._entropy(distfn, *args)
    npt.assert_allclose(specific, generic)
|
||||
|
||||
|
||||
def check_entropy_vect_scale(distfn, arg):
    """``entropy`` with an array-valued ``scale`` must match scalar calls."""
    def compare(scales):
        vectorized = distfn.entropy(*arg, scale=scales)
        scalar = [distfn.entropy(*arg, scale=s) for s in np.ravel(scales)]
        scalar = np.asarray(scalar).reshape(vectorized.shape)
        assert_allclose(vectorized, scalar, atol=1e-14)

    # 2-d array of scales
    compare(np.asarray([[1, 2], [3, 4]]))
    # list input including an invalid (negative) scale -> nan; also
    # exercises the cast from a plain Python list
    compare([1, 2, -3])
|
||||
|
||||
|
||||
def check_edge_support(distfn, args):
    """Check cdf/sf/logcdf/logsf/ppf/isf at the edges of the support."""
    # Make sure that x=self.a and self.b are handled correctly.
    x = distfn.support(*args)
    if isinstance(distfn, stats.rv_discrete):
        # For discrete distributions cdf(a) already includes the mass at a,
        # so evaluate just below the smallest support point instead.
        x = x[0]-1, x[1]

    npt.assert_equal(distfn.cdf(x, *args), [0.0, 1.0])
    npt.assert_equal(distfn.sf(x, *args), [1.0, 0.0])

    if distfn.name not in ('skellam', 'dlaplace'):
        # with a = -inf, log(0) generates warnings
        npt.assert_equal(distfn.logcdf(x, *args), [-np.inf, 0.0])
        npt.assert_equal(distfn.logsf(x, *args), [0.0, -np.inf])

    # the inverse functions must map {0, 1} back onto the support edges
    npt.assert_equal(distfn.ppf([0.0, 1.0], *args), x)
    npt.assert_equal(distfn.isf([0.0, 1.0], *args), x[::-1])

    # out-of-bounds for isf & ppf
    npt.assert_(np.isnan(distfn.isf([-1, 2], *args)).all())
    npt.assert_(np.isnan(distfn.ppf([-1, 2], *args)).all())
|
||||
|
||||
|
||||
def check_named_args(distfn, x, shape_args, defaults, meths):
    """Check that shape parameters can be passed as keyword arguments.

    Each method in *meths* is called at *x* with the shape parameters
    passed positionally, then with progressively more of them passed by
    name; all calls must produce identical results.
    """
    ## Check calling w/ named arguments.

    # check consistency of shapes, numargs and _parse signature
    signature = _getfullargspec(distfn._parse_args)
    npt.assert_(signature.varargs is None)
    npt.assert_(signature.varkw is None)
    npt.assert_(not signature.kwonlyargs)
    npt.assert_(list(signature.defaults) == list(defaults))

    shape_argnames = signature.args[:-len(defaults)]  # a, b, loc=0, scale=1
    if distfn.shapes:
        shapes_ = distfn.shapes.replace(',', ' ').split()
    else:
        shapes_ = ''
    npt.assert_(len(shapes_) == distfn.numargs)
    npt.assert_(len(shapes_) == len(shape_argnames))

    # check calling w/ named arguments
    shape_args = list(shape_args)

    # baseline values with all shape parameters positional
    vals = [meth(x, *shape_args) for meth in meths]
    npt.assert_(np.all(np.isfinite(vals)))

    # move one trailing shape parameter at a time from positional to keyword
    names, a, k = shape_argnames[:], shape_args[:], {}
    while names:
        k.update({names.pop(): a.pop()})
        v = [meth(x, *a, **k) for meth in meths]
        npt.assert_array_equal(vals, v)
        if 'n' not in k.keys():
            # `n` is first parameter of moment(), so can't be used as named arg
            npt.assert_equal(distfn.moment(1, *a, **k),
                             distfn.moment(1, *shape_args))

    # unknown arguments should not go through:
    k.update({'kaboom': 42})
    assert_raises(TypeError, distfn.cdf, x, **k)
|
||||
|
||||
|
||||
def check_random_state_property(distfn, args):
    """Check the ``random_state`` attribute of a distribution *instance*.

    Seeds set via the global state, an int, a RandomState instance, and a
    per-call ``random_state=`` argument must all produce the same draws.
    """
    # This test fiddles with distfn.random_state. This breaks other tests,
    # hence need to save it and then restore.
    rndm = distfn.random_state

    # baseline: this relies on the global state
    np.random.seed(1234)
    distfn.random_state = None
    r0 = distfn.rvs(*args, size=8)

    # use an explicit instance-level random_state
    distfn.random_state = 1234
    r1 = distfn.rvs(*args, size=8)
    npt.assert_equal(r0, r1)

    distfn.random_state = np.random.RandomState(1234)
    r2 = distfn.rvs(*args, size=8)
    npt.assert_equal(r0, r2)

    # check that np.random.Generator can be used (numpy >= 1.17)
    if hasattr(np.random, 'default_rng'):
        # obtain a np.random.Generator object
        rng = np.random.default_rng(1234)
        distfn.rvs(*args, size=1, random_state=rng)

    # can override the instance-level random_state for an individual .rvs call
    distfn.random_state = 2
    orig_state = distfn.random_state.get_state()

    r3 = distfn.rvs(*args, size=8, random_state=np.random.RandomState(1234))
    npt.assert_equal(r0, r3)

    # ... and that does not alter the instance-level random_state!
    npt.assert_equal(distfn.random_state.get_state(), orig_state)

    # finally, restore the random_state
    distfn.random_state = rndm
|
||||
|
||||
|
||||
def check_meth_dtype(distfn, arg, meths):
|
||||
q0 = [0.25, 0.5, 0.75]
|
||||
x0 = distfn.ppf(q0, *arg)
|
||||
x_cast = [x0.astype(tp) for tp in (np_long, np.float16, np.float32,
|
||||
np.float64)]
|
||||
|
||||
for x in x_cast:
|
||||
# casting may have clipped the values, exclude those
|
||||
distfn._argcheck(*arg)
|
||||
x = x[(distfn.a < x) & (x < distfn.b)]
|
||||
for meth in meths:
|
||||
val = meth(x, *arg)
|
||||
npt.assert_(val.dtype == np.float64)
|
||||
|
||||
|
||||
def check_ppf_dtype(distfn, arg):
    """``ppf``/``isf`` must return float64 for any float probability dtype."""
    probs = np.asarray([0.25, 0.5, 0.75])
    for dtype in (np.float16, np.float32, np.float64):
        q = probs.astype(dtype)
        for inverse in (distfn.ppf, distfn.isf):
            npt.assert_(inverse(q, *arg).dtype == np.float64)
|
||||
|
||||
|
||||
def check_cmplx_deriv(distfn, arg):
    """Check derivatives of cdf/sf/logpdf etc. via complex-step differentiation.

    Relies on the distributions accepting complex arguments: the imaginary
    part of ``f(x + ih)/h`` approximates ``f'(x)`` without subtractive
    cancellation.
    """
    # Distributions allow complex arguments.
    def deriv(f, x, *arg):
        x = np.asarray(x)
        h = 1e-10  # step is tiny; complex-step has no cancellation error
        return (f(x + h*1j, *arg)/h).imag

    x0 = distfn.ppf([0.25, 0.51, 0.75], *arg)
    x_cast = [x0.astype(tp) for tp in (np_long, np.float16, np.float32,
                                       np.float64)]

    for x in x_cast:
        # casting may have clipped the values, exclude those
        distfn._argcheck(*arg)
        x = x[(distfn.a < x) & (x < distfn.b)]

        # d/dx cdf = pdf, d/dx log cdf = pdf/cdf, and the sf analogues
        pdf, cdf, sf = distfn.pdf(x, *arg), distfn.cdf(x, *arg), distfn.sf(x, *arg)
        assert_allclose(deriv(distfn.cdf, x, *arg), pdf, rtol=1e-5)
        assert_allclose(deriv(distfn.logcdf, x, *arg), pdf/cdf, rtol=1e-5)

        assert_allclose(deriv(distfn.sf, x, *arg), -pdf, rtol=1e-5)
        assert_allclose(deriv(distfn.logsf, x, *arg), -pdf/sf, rtol=1e-5)

        assert_allclose(deriv(distfn.logpdf, x, *arg),
                        deriv(distfn.pdf, x, *arg) / distfn.pdf(x, *arg),
                        rtol=1e-5)
|
||||
|
||||
|
||||
def check_pickling(distfn, args):
    """Check that a distribution instance pickles and unpickles.

    Pays special attention to the ``random_state`` property: the pickled
    copy must reproduce the same random draws as the original.
    """
    # save the random_state (restore later)
    rndm = distfn.random_state

    # check unfrozen
    distfn.random_state = 1234
    distfn.rvs(*args, size=8)
    s = pickle.dumps(distfn)
    r0 = distfn.rvs(*args, size=8)

    unpickled = pickle.loads(s)
    r1 = unpickled.rvs(*args, size=8)
    npt.assert_equal(r0, r1)

    # also smoke test some methods
    medians = [distfn.ppf(0.5, *args), unpickled.ppf(0.5, *args)]
    npt.assert_equal(medians[0], medians[1])
    npt.assert_equal(distfn.cdf(medians[0], *args),
                     unpickled.cdf(medians[1], *args))

    # check frozen pickling/unpickling with rvs
    frozen_dist = distfn(*args)
    pkl = pickle.dumps(frozen_dist)
    unpickled = pickle.loads(pkl)

    r0 = frozen_dist.rvs(size=8)
    r1 = unpickled.rvs(size=8)
    npt.assert_equal(r0, r1)

    # check pickling/unpickling of .fit method
    if hasattr(distfn, "fit"):
        fit_function = distfn.fit
        pickled_fit_function = pickle.dumps(fit_function)
        unpickled_fit_function = pickle.loads(pickled_fit_function)
        assert fit_function.__name__ == unpickled_fit_function.__name__ == "fit"

    # restore the random_state
    distfn.random_state = rndm
|
||||
|
||||
|
||||
def check_freezing(distfn, args):
    """Freezing with loc/scale must not break the support.

    Regression test for gh-11089: freezing a distribution failed when
    loc and/or scale were specified.
    """
    # discrete distributions do not take a `scale` argument
    if isinstance(distfn, stats.rv_continuous):
        shift = dict(loc=1, scale=2)
    else:
        shift = dict(loc=1)

    frozen = distfn(*args, **shift)
    reference = distfn(*args)
    assert frozen.a == reference.a
    assert frozen.b == reference.b
|
||||
|
||||
|
||||
def check_rvs_broadcast(distfunc, distname, allargs, shape, shape_only, otype):
    """Check that ``rvs`` broadcasts its arguments to the expected *shape*.

    When ``shape_only`` is False, the broadcast sample is also compared
    element-wise against scalar ``rvs`` calls made with the same seed.
    """
    np.random.seed(123)
    sample = distfunc.rvs(*allargs)
    assert_equal(sample.shape, shape, "%s: rvs failed to broadcast" % distname)
    if not shape_only:
        # np.vectorize draws one variate per broadcast element; reseeding
        # makes the scalar draws line up with the vectorized ones.
        rvs = np.vectorize(lambda *allargs: distfunc.rvs(*allargs), otypes=otype)
        np.random.seed(123)
        expected = rvs(*allargs)
        assert_allclose(sample, expected, rtol=1e-13)
|
||||
171
venv/lib/python3.12/site-packages/scipy/stats/tests/data/_mvt.py
Normal file
171
venv/lib/python3.12/site-packages/scipy/stats/tests/data/_mvt.py
Normal file
@ -0,0 +1,171 @@
|
||||
import math
|
||||
import numpy as np
|
||||
from scipy import special
|
||||
from scipy.stats._qmc import primes_from_2_to
|
||||
|
||||
|
||||
def _primes(n):
|
||||
# Defined to facilitate comparison between translation and source
|
||||
# In Matlab, primes(10.5) -> first four primes, primes(11.5) -> first five
|
||||
return primes_from_2_to(math.ceil(n))
|
||||
|
||||
|
||||
def _gaminv(a, b):
|
||||
# Defined to facilitate comparison between translation and source
|
||||
# Matlab's `gaminv` is like `special.gammaincinv` but args are reversed
|
||||
return special.gammaincinv(b, a)
|
||||
|
||||
|
||||
def _qsimvtv(m, nu, sigma, a, b, rng):
    """Estimates the multivariate t CDF using randomized QMC

    Parameters
    ----------
    m : int
        The number of points
    nu : float
        Degrees of freedom
    sigma : ndarray
        A 2D positive semidefinite covariance matrix
    a : ndarray
        Lower integration limits
    b : ndarray
        Upper integration limits.
    rng : Generator
        Pseudorandom number generator

    Returns
    -------
    p : float
        The estimated CDF.
    e : float
        An absolute error estimate.

    """
    # _qsimvtv is a Python translation of the Matlab function qsimvtv,
    # semicolons and all.
    #
    # This function uses an algorithm given in the paper
    # "Comparison of Methods for the Numerical Computation of
    # Multivariate t Probabilities", in
    # J. of Computational and Graphical Stat., 11(2002), pp. 950-971, by
    # Alan Genz and Frank Bretz
    #
    # The primary references for the numerical integration are
    # "On a Number-Theoretical Integration Method"
    # H. Niederreiter, Aequationes Mathematicae, 8(1972), pp. 304-11.
    # and
    # "Randomization of Number Theoretic Methods for Multiple Integration"
    # R. Cranley & T.N.L. Patterson, SIAM J Numer Anal, 13(1976), pp. 904-14.
    #
    # Alan Genz is the author of this function and following Matlab functions.
    # Alan Genz, WSU Math, PO Box 643113, Pullman, WA 99164-3113
    # Email : alangenz@wsu.edu
    #
    # Copyright (C) 2013, Alan Genz, All rights reserved.
    #
    # Redistribution and use in source and binary forms, with or without
    # modification, are permitted provided the following conditions are met:
    # 1. Redistributions of source code must retain the above copyright
    # notice, this list of conditions and the following disclaimer.
    # 2. Redistributions in binary form must reproduce the above copyright
    # notice, this list of conditions and the following disclaimer in
    # the documentation and/or other materials provided with the
    # distribution.
    # 3. The contributor name(s) may not be used to endorse or promote
    # products derived from this software without specific prior
    # written permission.
    # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
    # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
    # TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF USE
    # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    # Initialization
    sn = max(1, math.sqrt(nu)); ch, az, bz = _chlrps(sigma, a/sn, b/sn)
    n = len(sigma); N = 10; P = math.ceil(m/N); on = np.ones(P); p = 0; e = 0
    ps = np.sqrt(_primes(5*n*math.log(n+4)/4)); q = ps[:, np.newaxis]  # Richtmyer gens.

    # Randomization loop for ns samples
    c = None; dc = None
    for S in range(N):
        vp = on.copy(); s = np.zeros((n, P))
        for i in range(n):
            x = np.abs(2*np.mod(q[i]*np.arange(1, P+1) + rng.random(), 1)-1)  # periodizing transform
            if i == 0:
                r = on
                if nu > 0:
                    r = np.sqrt(2*_gaminv(x, nu/2))
            else:
                # condition on the previously integrated coordinates
                y = _Phinv(c + x*dc)
                s[i:] += ch[i:, i-1:i] * y
            si = s[i, :]; c = on.copy(); ai = az[i]*r - si; d = on.copy(); bi = bz[i]*r - si
            # clamp the normal CDF outside +-9 standard deviations to 0/1
            c[ai <= -9] = 0; tl = abs(ai) < 9; c[tl] = _Phi(ai[tl])
            d[bi <= -9] = 0; tl = abs(bi) < 9; d[tl] = _Phi(bi[tl])
            dc = d - c; vp = vp * dc
        # running mean of the N randomizations and its variance update
        d = (np.mean(vp) - p)/(S + 1); p = p + d; e = (S - 1)*e/(S + 1) + d**2
    e = math.sqrt(e)  # error estimate is 3 times std error with N samples.
    return p, e
|
||||
|
||||
|
||||
# Standard statistical normal distribution functions
|
||||
def _Phi(z):
    """Standard normal CDF (Matlab's ``Phi``)."""
    return special.ndtr(z)
|
||||
|
||||
|
||||
def _Phinv(p):
    """Inverse standard normal CDF (Matlab's ``Phinv``)."""
    return special.ndtri(p)
|
||||
|
||||
|
||||
def _chlrps(R, a, b):
    """
    Computes permuted and scaled lower Cholesky factor c for R which may be
    singular, also permuting and scaling integration limit vectors a and b.
    """
    ep = 1e-10  # singularity tolerance
    eps = np.finfo(R.dtype).eps

    # Scale R to unit diagonal and scale the limits accordingly.
    n = len(R); c = R.copy(); ap = a.copy(); bp = b.copy(); d = np.sqrt(np.maximum(np.diag(c), 0))
    for i in range(n):
        if d[i] > 0:
            c[:, i] /= d[i]; c[i, :] /= d[i]
            ap[i] /= d[i]; bp[i] /= d[i]
    y = np.zeros((n, 1)); sqtp = math.sqrt(2*math.pi)

    for k in range(n):
        # Pivot: pick the remaining variable with the smallest conditional
        # probability mass (de) to improve numerical behavior.
        im = k; ckk = 0; dem = 1; s = 0
        for i in range(k, n):
            if c[i, i] > eps:
                cii = math.sqrt(max(c[i, i], 0))
                if i > 0: s = c[i, :k] @ y[:k]
                ai = (ap[i]-s)/cii; bi = (bp[i]-s)/cii; de = _Phi(bi)-_Phi(ai)
                if de <= dem:
                    ckk = cii; dem = de; am = ai; bm = bi; im = i
        if im > k:
            # Swap rows/columns im <-> k of the partial factor and limits.
            ap[[im, k]] = ap[[k, im]]; bp[[im, k]] = bp[[k, im]]; c[im, im] = c[k, k]
            t = c[im, :k].copy(); c[im, :k] = c[k, :k]; c[k, :k] = t
            t = c[im+1:, im].copy(); c[im+1:, im] = c[im+1:, k]; c[im+1:, k] = t
            t = c[k+1:im, k].copy(); c[k+1:im, k] = c[im, k+1:im].T; c[im, k+1:im] = t.T
        if ckk > ep*(k+1):
            # Regular step: eliminate column k.
            c[k, k] = ckk; c[k, k+1:] = 0
            for i in range(k+1, n):
                c[i, k] = c[i, k]/ckk; c[i, k+1:i+1] = c[i, k+1:i+1] - c[i, k]*c[k+1:i+1, k].T
            if abs(dem) > ep:
                # Conditional expected value of the truncated normal.
                y[k] = (np.exp(-am**2/2) - np.exp(-bm**2/2)) / (sqtp*dem)
            else:
                y[k] = (am + bm) / 2
                if am < -10:
                    y[k] = bm
                elif bm > 10:
                    y[k] = am
            c[k, :k+1] /= ckk; ap[k] /= ckk; bp[k] /= ckk
        else:
            # (Near-)singular pivot: zero the column and use the midpoint.
            c[k:, k] = 0; y[k] = (ap[k] + bp[k])/2
        pass
    return c, ap, bp
|
||||
@ -0,0 +1,607 @@
|
||||
# DO NOT EDIT THIS FILE!
|
||||
# This file was generated by the R script
|
||||
# generate_fisher_exact_results_from_r.R
|
||||
# The script was run with R version 3.6.2 (2019-12-12) at 2020-11-09 06:16:09
|
||||
|
||||
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
|
||||
|
||||
Inf = np.inf
|
||||
|
||||
Parameters = namedtuple('Parameters',
|
||||
['table', 'confidence_level', 'alternative'])
|
||||
RResults = namedtuple('RResults',
|
||||
['pvalue', 'conditional_odds_ratio',
|
||||
'conditional_odds_ratio_ci'])
|
||||
data = [
|
||||
(Parameters(table=[[100, 2], [1000, 5]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.1300759363430016,
|
||||
conditional_odds_ratio=0.25055839934223,
|
||||
conditional_odds_ratio_ci=(0.04035202926536294,
|
||||
2.662846672960251))),
|
||||
(Parameters(table=[[2, 7], [8, 2]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.02301413756522116,
|
||||
conditional_odds_ratio=0.0858623513573622,
|
||||
conditional_odds_ratio_ci=(0.004668988338943325,
|
||||
0.895792956493601))),
|
||||
(Parameters(table=[[5, 1], [10, 10]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.1973244147157191,
|
||||
conditional_odds_ratio=4.725646047336587,
|
||||
conditional_odds_ratio_ci=(0.4153910882532168,
|
||||
259.2593661129417))),
|
||||
(Parameters(table=[[5, 15], [20, 20]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.09580440012477633,
|
||||
conditional_odds_ratio=0.3394396617440851,
|
||||
conditional_odds_ratio_ci=(0.08056337526385809,
|
||||
1.22704788545557))),
|
||||
(Parameters(table=[[5, 16], [16, 25]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.2697004098849359,
|
||||
conditional_odds_ratio=0.4937791394540491,
|
||||
conditional_odds_ratio_ci=(0.1176691231650079,
|
||||
1.787463657995973))),
|
||||
(Parameters(table=[[10, 5], [10, 1]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.1973244147157192,
|
||||
conditional_odds_ratio=0.2116112781158479,
|
||||
conditional_odds_ratio_ci=(0.003857141267422399,
|
||||
2.407369893767229))),
|
||||
(Parameters(table=[[10, 5], [10, 0]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.06126482213438735,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.451643573543705))),
|
||||
(Parameters(table=[[5, 0], [1, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.04761904761904762,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(1.024822256141754,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 5], [1, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
39.00054996869288))),
|
||||
(Parameters(table=[[5, 1], [0, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.04761904761904761,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(1.024822256141754,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 1], [3, 2]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
39.00054996869287))),
|
||||
(Parameters(table=[[200, 7], [8, 300]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=2.005657880389071e-122,
|
||||
conditional_odds_ratio=977.7866978606228,
|
||||
conditional_odds_ratio_ci=(349.2595113327733,
|
||||
3630.382605689872))),
|
||||
(Parameters(table=[[28, 21], [6, 1957]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=5.728437460831947e-44,
|
||||
conditional_odds_ratio=425.2403028434684,
|
||||
conditional_odds_ratio_ci=(152.4166024390096,
|
||||
1425.700792178893))),
|
||||
(Parameters(table=[[190, 800], [200, 900]],
|
||||
confidence_level=0.95,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.574111858126088,
|
||||
conditional_odds_ratio=1.068697577856801,
|
||||
conditional_odds_ratio_ci=(0.8520462587912048,
|
||||
1.340148950273938))),
|
||||
(Parameters(table=[[100, 2], [1000, 5]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.1300759363430016,
|
||||
conditional_odds_ratio=0.25055839934223,
|
||||
conditional_odds_ratio_ci=(0.02502345007115455,
|
||||
6.304424772117853))),
|
||||
(Parameters(table=[[2, 7], [8, 2]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.02301413756522116,
|
||||
conditional_odds_ratio=0.0858623513573622,
|
||||
conditional_odds_ratio_ci=(0.001923034001462487,
|
||||
1.53670836950172))),
|
||||
(Parameters(table=[[5, 1], [10, 10]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.1973244147157191,
|
||||
conditional_odds_ratio=4.725646047336587,
|
||||
conditional_odds_ratio_ci=(0.2397970951413721,
|
||||
1291.342011095509))),
|
||||
(Parameters(table=[[5, 15], [20, 20]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.09580440012477633,
|
||||
conditional_odds_ratio=0.3394396617440851,
|
||||
conditional_odds_ratio_ci=(0.05127576113762925,
|
||||
1.717176678806983))),
|
||||
(Parameters(table=[[5, 16], [16, 25]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.2697004098849359,
|
||||
conditional_odds_ratio=0.4937791394540491,
|
||||
conditional_odds_ratio_ci=(0.07498546954483619,
|
||||
2.506969905199901))),
|
||||
(Parameters(table=[[10, 5], [10, 1]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.1973244147157192,
|
||||
conditional_odds_ratio=0.2116112781158479,
|
||||
conditional_odds_ratio_ci=(0.0007743881879531337,
|
||||
4.170192301163831))),
|
||||
(Parameters(table=[[10, 5], [10, 0]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.06126482213438735,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
2.642491011905582))),
|
||||
(Parameters(table=[[5, 0], [1, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.04761904761904762,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0.496935393325443,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 5], [1, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
198.019801980198))),
|
||||
(Parameters(table=[[5, 1], [0, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.04761904761904761,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0.496935393325443,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 1], [3, 2]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
198.019801980198))),
|
||||
(Parameters(table=[[200, 7], [8, 300]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=2.005657880389071e-122,
|
||||
conditional_odds_ratio=977.7866978606228,
|
||||
conditional_odds_ratio_ci=(270.0334165523604,
|
||||
5461.333333326708))),
|
||||
(Parameters(table=[[28, 21], [6, 1957]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=5.728437460831947e-44,
|
||||
conditional_odds_ratio=425.2403028434684,
|
||||
conditional_odds_ratio_ci=(116.7944750275836,
|
||||
1931.995993191814))),
|
||||
(Parameters(table=[[190, 800], [200, 900]],
|
||||
confidence_level=0.99,
|
||||
alternative='two.sided'),
|
||||
RResults(pvalue=0.574111858126088,
|
||||
conditional_odds_ratio=1.068697577856801,
|
||||
conditional_odds_ratio_ci=(0.7949398282935892,
|
||||
1.436229679394333))),
|
||||
(Parameters(table=[[100, 2], [1000, 5]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.1300759363430016,
|
||||
conditional_odds_ratio=0.25055839934223,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.797867027270803))),
|
||||
(Parameters(table=[[2, 7], [8, 2]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.0185217259520665,
|
||||
conditional_odds_ratio=0.0858623513573622,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
0.6785254803404526))),
|
||||
(Parameters(table=[[5, 1], [10, 10]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.9782608695652173,
|
||||
conditional_odds_ratio=4.725646047336587,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
127.8497388102893))),
|
||||
(Parameters(table=[[5, 15], [20, 20]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.05625775074399956,
|
||||
conditional_odds_ratio=0.3394396617440851,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.032332939718425))),
|
||||
(Parameters(table=[[5, 16], [16, 25]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.1808979350599346,
|
||||
conditional_odds_ratio=0.4937791394540491,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.502407513296985))),
|
||||
(Parameters(table=[[10, 5], [10, 1]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.1652173913043479,
|
||||
conditional_odds_ratio=0.2116112781158479,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.820421051562392))),
|
||||
(Parameters(table=[[10, 5], [10, 0]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.0565217391304348,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.06224603077045))),
|
||||
(Parameters(table=[[5, 0], [1, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 5], [1, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.5,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
19.00192394479939))),
|
||||
(Parameters(table=[[5, 1], [0, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 1], [3, 2]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.4999999999999999,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
19.00192394479939))),
|
||||
(Parameters(table=[[200, 7], [8, 300]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=977.7866978606228,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
3045.460216525746))),
|
||||
(Parameters(table=[[28, 21], [6, 1957]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=425.2403028434684,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1186.440170942579))),
|
||||
(Parameters(table=[[190, 800], [200, 900]],
|
||||
confidence_level=0.95,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.7416227010368963,
|
||||
conditional_odds_ratio=1.068697577856801,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.293551891610822))),
|
||||
(Parameters(table=[[100, 2], [1000, 5]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.1300759363430016,
|
||||
conditional_odds_ratio=0.25055839934223,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
4.375946050832565))),
|
||||
(Parameters(table=[[2, 7], [8, 2]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.0185217259520665,
|
||||
conditional_odds_ratio=0.0858623513573622,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.235282118191202))),
|
||||
(Parameters(table=[[5, 1], [10, 10]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.9782608695652173,
|
||||
conditional_odds_ratio=4.725646047336587,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
657.2063583945989))),
|
||||
(Parameters(table=[[5, 15], [20, 20]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.05625775074399956,
|
||||
conditional_odds_ratio=0.3394396617440851,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.498867660683128))),
|
||||
(Parameters(table=[[5, 16], [16, 25]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.1808979350599346,
|
||||
conditional_odds_ratio=0.4937791394540491,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
2.186159386716762))),
|
||||
(Parameters(table=[[10, 5], [10, 1]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.1652173913043479,
|
||||
conditional_odds_ratio=0.2116112781158479,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
3.335351451901569))),
|
||||
(Parameters(table=[[10, 5], [10, 0]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.0565217391304348,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
2.075407697450433))),
|
||||
(Parameters(table=[[5, 0], [1, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 5], [1, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.5,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
99.00009507969122))),
|
||||
(Parameters(table=[[5, 1], [0, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 1], [3, 2]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.4999999999999999,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
99.00009507969123))),
|
||||
(Parameters(table=[[200, 7], [8, 300]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=977.7866978606228,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
4503.078257659934))),
|
||||
(Parameters(table=[[28, 21], [6, 1957]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=425.2403028434684,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1811.766127544222))),
|
||||
(Parameters(table=[[190, 800], [200, 900]],
|
||||
confidence_level=0.99,
|
||||
alternative='less'),
|
||||
RResults(pvalue=0.7416227010368963,
|
||||
conditional_odds_ratio=1.068697577856801,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
1.396522811516685))),
|
||||
(Parameters(table=[[100, 2], [1000, 5]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.979790445314723,
|
||||
conditional_odds_ratio=0.25055839934223,
|
||||
conditional_odds_ratio_ci=(0.05119649909830196,
|
||||
Inf))),
|
||||
(Parameters(table=[[2, 7], [8, 2]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9990149169715733,
|
||||
conditional_odds_ratio=0.0858623513573622,
|
||||
conditional_odds_ratio_ci=(0.007163749169069961,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 1], [10, 10]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.1652173913043478,
|
||||
conditional_odds_ratio=4.725646047336587,
|
||||
conditional_odds_ratio_ci=(0.5493234651081089,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 15], [20, 20]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9849086665340765,
|
||||
conditional_odds_ratio=0.3394396617440851,
|
||||
conditional_odds_ratio_ci=(0.1003538933958604,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 16], [16, 25]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9330176609214881,
|
||||
conditional_odds_ratio=0.4937791394540491,
|
||||
conditional_odds_ratio_ci=(0.146507416280863,
|
||||
Inf))),
|
||||
(Parameters(table=[[10, 5], [10, 1]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9782608695652174,
|
||||
conditional_odds_ratio=0.2116112781158479,
|
||||
conditional_odds_ratio_ci=(0.007821681994077808,
|
||||
Inf))),
|
||||
(Parameters(table=[[10, 5], [10, 0]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 0], [1, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.02380952380952382,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(1.487678929918272,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 5], [1, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 1], [0, 4]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.0238095238095238,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(1.487678929918272,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 1], [3, 2]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[200, 7], [8, 300]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=2.005657880388915e-122,
|
||||
conditional_odds_ratio=977.7866978606228,
|
||||
conditional_odds_ratio_ci=(397.784359748113,
|
||||
Inf))),
|
||||
(Parameters(table=[[28, 21], [6, 1957]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=5.728437460831983e-44,
|
||||
conditional_odds_ratio=425.2403028434684,
|
||||
conditional_odds_ratio_ci=(174.7148056880929,
|
||||
Inf))),
|
||||
(Parameters(table=[[190, 800], [200, 900]],
|
||||
confidence_level=0.95,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.2959825901308897,
|
||||
conditional_odds_ratio=1.068697577856801,
|
||||
conditional_odds_ratio_ci=(0.8828406663967776,
|
||||
Inf))),
|
||||
(Parameters(table=[[100, 2], [1000, 5]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.979790445314723,
|
||||
conditional_odds_ratio=0.25055839934223,
|
||||
conditional_odds_ratio_ci=(0.03045407081240429,
|
||||
Inf))),
|
||||
(Parameters(table=[[2, 7], [8, 2]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9990149169715733,
|
||||
conditional_odds_ratio=0.0858623513573622,
|
||||
conditional_odds_ratio_ci=(0.002768053063547901,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 1], [10, 10]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.1652173913043478,
|
||||
conditional_odds_ratio=4.725646047336587,
|
||||
conditional_odds_ratio_ci=(0.2998184792279909,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 15], [20, 20]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9849086665340765,
|
||||
conditional_odds_ratio=0.3394396617440851,
|
||||
conditional_odds_ratio_ci=(0.06180414342643172,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 16], [16, 25]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9330176609214881,
|
||||
conditional_odds_ratio=0.4937791394540491,
|
||||
conditional_odds_ratio_ci=(0.09037094010066403,
|
||||
Inf))),
|
||||
(Parameters(table=[[10, 5], [10, 1]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.9782608695652174,
|
||||
conditional_odds_ratio=0.2116112781158479,
|
||||
conditional_odds_ratio_ci=(0.001521592095430679,
|
||||
Inf))),
|
||||
(Parameters(table=[[10, 5], [10, 0]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 0], [1, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.02380952380952382,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0.6661157890359722,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 5], [1, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[5, 1], [0, 4]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.0238095238095238,
|
||||
conditional_odds_ratio=Inf,
|
||||
conditional_odds_ratio_ci=(0.6661157890359725,
|
||||
Inf))),
|
||||
(Parameters(table=[[0, 1], [3, 2]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=1,
|
||||
conditional_odds_ratio=0,
|
||||
conditional_odds_ratio_ci=(0,
|
||||
Inf))),
|
||||
(Parameters(table=[[200, 7], [8, 300]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=2.005657880388915e-122,
|
||||
conditional_odds_ratio=977.7866978606228,
|
||||
conditional_odds_ratio_ci=(297.9619252357688,
|
||||
Inf))),
|
||||
(Parameters(table=[[28, 21], [6, 1957]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=5.728437460831983e-44,
|
||||
conditional_odds_ratio=425.2403028434684,
|
||||
conditional_odds_ratio_ci=(130.3213490295859,
|
||||
Inf))),
|
||||
(Parameters(table=[[190, 800], [200, 900]],
|
||||
confidence_level=0.99,
|
||||
alternative='greater'),
|
||||
RResults(pvalue=0.2959825901308897,
|
||||
conditional_odds_ratio=1.068697577856801,
|
||||
conditional_odds_ratio_ci=(0.8176272148267533,
|
||||
Inf))),
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,108 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: AtmWtAg (AtmWtAg.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 108)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Powell, L.J., Murphy, T.J. and Gramlich, J.W. (1982).
|
||||
"The Absolute Isotopic Abundance & Atomic Weight
|
||||
of a Reference Sample of Silver".
|
||||
NBS Journal of Research, 87, pp. 9-19.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
2 Treatments
|
||||
24 Replicates/Cell
|
||||
48 Observations
|
||||
7 Constant Leading Digits
|
||||
Average Level of Difficulty
|
||||
Observed Data
|
||||
|
||||
|
||||
Model: 3 Parameters (mu, tau_1, tau_2)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
|
||||
Between Instrument 1 3.63834187500000E-09 3.63834187500000E-09 1.59467335677930E+01
|
||||
Within Instrument 46 1.04951729166667E-08 2.28155932971014E-10
|
||||
|
||||
Certified R-Squared 2.57426544538321E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.51048314446410E-05
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Instrument AgWt
|
||||
1 107.8681568
|
||||
1 107.8681465
|
||||
1 107.8681572
|
||||
1 107.8681785
|
||||
1 107.8681446
|
||||
1 107.8681903
|
||||
1 107.8681526
|
||||
1 107.8681494
|
||||
1 107.8681616
|
||||
1 107.8681587
|
||||
1 107.8681519
|
||||
1 107.8681486
|
||||
1 107.8681419
|
||||
1 107.8681569
|
||||
1 107.8681508
|
||||
1 107.8681672
|
||||
1 107.8681385
|
||||
1 107.8681518
|
||||
1 107.8681662
|
||||
1 107.8681424
|
||||
1 107.8681360
|
||||
1 107.8681333
|
||||
1 107.8681610
|
||||
1 107.8681477
|
||||
2 107.8681079
|
||||
2 107.8681344
|
||||
2 107.8681513
|
||||
2 107.8681197
|
||||
2 107.8681604
|
||||
2 107.8681385
|
||||
2 107.8681642
|
||||
2 107.8681365
|
||||
2 107.8681151
|
||||
2 107.8681082
|
||||
2 107.8681517
|
||||
2 107.8681448
|
||||
2 107.8681198
|
||||
2 107.8681482
|
||||
2 107.8681334
|
||||
2 107.8681609
|
||||
2 107.8681101
|
||||
2 107.8681512
|
||||
2 107.8681469
|
||||
2 107.8681360
|
||||
2 107.8681254
|
||||
2 107.8681261
|
||||
2 107.8681450
|
||||
2 107.8681368
|
||||
@ -0,0 +1,85 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SiRstv (SiRstv.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 85)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Ehrstein, James and Croarkin, M. Carroll.
|
||||
Unpublished NIST dataset.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
5 Treatments
|
||||
5 Replicates/Cell
|
||||
25 Observations
|
||||
3 Constant Leading Digits
|
||||
Lower Level of Difficulty
|
||||
Observed Data
|
||||
|
||||
|
||||
Model: 6 Parameters (mu,tau_1, ... , tau_5)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Instrument 4 5.11462616000000E-02 1.27865654000000E-02 1.18046237440255E+00
|
||||
Within Instrument 20 2.16636560000000E-01 1.08318280000000E-02
|
||||
|
||||
Certified R-Squared 1.90999039051129E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.04076068334656E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Instrument Resistance
|
||||
1 196.3052
|
||||
1 196.1240
|
||||
1 196.1890
|
||||
1 196.2569
|
||||
1 196.3403
|
||||
2 196.3042
|
||||
2 196.3825
|
||||
2 196.1669
|
||||
2 196.3257
|
||||
2 196.0422
|
||||
3 196.1303
|
||||
3 196.2005
|
||||
3 196.2889
|
||||
3 196.0343
|
||||
3 196.1811
|
||||
4 196.2795
|
||||
4 196.1748
|
||||
4 196.1494
|
||||
4 196.1485
|
||||
4 195.9885
|
||||
5 196.2119
|
||||
5 196.1051
|
||||
5 196.1850
|
||||
5 196.0052
|
||||
5 196.2090
|
||||
@ -0,0 +1,249 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SmLs01 (SmLs01.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 249)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Simon, Stephen D. and Lesage, James P. (1989).
|
||||
"Assessing the Accuracy of ANOVA Calculations in
|
||||
Statistical Software".
|
||||
Computational Statistics & Data Analysis, 8, pp. 325-332.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
9 Treatments
|
||||
21 Replicates/Cell
|
||||
189 Observations
|
||||
1 Constant Leading Digit
|
||||
Lower Level of Difficulty
|
||||
Generated Data
|
||||
|
||||
|
||||
Model: 10 Parameters (mu,tau_1, ... , tau_9)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
|
||||
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
|
||||
|
||||
Certified R-Squared 4.82758620689655E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.00000000000000E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Treatment Response
|
||||
1 1.4
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
1 1.3
|
||||
1 1.5
|
||||
2 1.3
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
2 1.2
|
||||
2 1.4
|
||||
3 1.5
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
3 1.4
|
||||
3 1.6
|
||||
4 1.3
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
4 1.2
|
||||
4 1.4
|
||||
5 1.5
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
5 1.4
|
||||
5 1.6
|
||||
6 1.3
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
6 1.2
|
||||
6 1.4
|
||||
7 1.5
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
7 1.4
|
||||
7 1.6
|
||||
8 1.3
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
8 1.2
|
||||
8 1.4
|
||||
9 1.5
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
9 1.4
|
||||
9 1.6
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,249 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SmLs04 (SmLs04.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 249)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Simon, Stephen D. and Lesage, James P. (1989).
|
||||
"Assessing the Accuracy of ANOVA Calculations in
|
||||
Statistical Software".
|
||||
Computational Statistics & Data Analysis, 8, pp. 325-332.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
9 Treatments
|
||||
21 Replicates/Cell
|
||||
189 Observations
|
||||
7 Constant Leading Digits
|
||||
Average Level of Difficulty
|
||||
Generated Data
|
||||
|
||||
|
||||
Model: 10 Parameters (mu,tau_1, ... , tau_9)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
|
||||
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
|
||||
|
||||
Certified R-Squared 4.82758620689655E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.00000000000000E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Treatment Response
|
||||
1 1000000.4
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
1 1000000.3
|
||||
1 1000000.5
|
||||
2 1000000.3
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
2 1000000.2
|
||||
2 1000000.4
|
||||
3 1000000.5
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
3 1000000.4
|
||||
3 1000000.6
|
||||
4 1000000.3
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
4 1000000.2
|
||||
4 1000000.4
|
||||
5 1000000.5
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
5 1000000.4
|
||||
5 1000000.6
|
||||
6 1000000.3
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
6 1000000.2
|
||||
6 1000000.4
|
||||
7 1000000.5
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
7 1000000.4
|
||||
7 1000000.6
|
||||
8 1000000.3
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
8 1000000.2
|
||||
8 1000000.4
|
||||
9 1000000.5
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
9 1000000.4
|
||||
9 1000000.6
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,249 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: SmLs07 (SmLs07.dat)
|
||||
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 41 to 47)
|
||||
Data (lines 61 to 249)
|
||||
|
||||
|
||||
Procedure: Analysis of Variance
|
||||
|
||||
|
||||
Reference: Simon, Stephen D. and Lesage, James P. (1989).
|
||||
"Assessing the Accuracy of ANOVA Calculations in
|
||||
Statistical Software".
|
||||
Computational Statistics & Data Analysis, 8, pp. 325-332.
|
||||
|
||||
|
||||
Data: 1 Factor
|
||||
9 Treatments
|
||||
21 Replicates/Cell
|
||||
189 Observations
|
||||
13 Constant Leading Digits
|
||||
Higher Level of Difficulty
|
||||
Generated Data
|
||||
|
||||
|
||||
Model: 10 Parameters (mu,tau_1, ... , tau_9)
|
||||
y_{ij} = mu + tau_i + epsilon_{ij}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Certified Values:
|
||||
|
||||
Source of Sums of Mean
|
||||
Variation df Squares Squares F Statistic
|
||||
|
||||
Between Treatment 8 1.68000000000000E+00 2.10000000000000E-01 2.10000000000000E+01
|
||||
Within Treatment 180 1.80000000000000E+00 1.00000000000000E-02
|
||||
|
||||
Certified R-Squared 4.82758620689655E-01
|
||||
|
||||
Certified Residual
|
||||
Standard Deviation 1.00000000000000E-01
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: Treatment Response
|
||||
1 1000000000000.4
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
1 1000000000000.3
|
||||
1 1000000000000.5
|
||||
2 1000000000000.3
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
2 1000000000000.2
|
||||
2 1000000000000.4
|
||||
3 1000000000000.5
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
3 1000000000000.4
|
||||
3 1000000000000.6
|
||||
4 1000000000000.3
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
4 1000000000000.2
|
||||
4 1000000000000.4
|
||||
5 1000000000000.5
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
5 1000000000000.4
|
||||
5 1000000000000.6
|
||||
6 1000000000000.3
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
6 1000000000000.2
|
||||
6 1000000000000.4
|
||||
7 1000000000000.5
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
7 1000000000000.4
|
||||
7 1000000000000.6
|
||||
8 1000000000000.3
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
8 1000000000000.2
|
||||
8 1000000000000.4
|
||||
9 1000000000000.5
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
9 1000000000000.4
|
||||
9 1000000000000.6
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,97 @@
|
||||
NIST/ITL StRD
|
||||
Dataset Name: Norris (Norris.dat)
|
||||
|
||||
File Format: ASCII
|
||||
Certified Values (lines 31 to 46)
|
||||
Data (lines 61 to 96)
|
||||
|
||||
Procedure: Linear Least Squares Regression
|
||||
|
||||
Reference: Norris, J., NIST.
|
||||
Calibration of Ozone Monitors.
|
||||
|
||||
Data: 1 Response Variable (y)
|
||||
1 Predictor Variable (x)
|
||||
36 Observations
|
||||
Lower Level of Difficulty
|
||||
Observed Data
|
||||
|
||||
Model: Linear Class
|
||||
2 Parameters (B0,B1)
|
||||
|
||||
y = B0 + B1*x + e
|
||||
|
||||
|
||||
|
||||
Certified Regression Statistics
|
||||
|
||||
Standard Deviation
|
||||
Parameter Estimate of Estimate
|
||||
|
||||
B0 -0.262323073774029 0.232818234301152
|
||||
B1 1.00211681802045 0.429796848199937E-03
|
||||
|
||||
Residual
|
||||
Standard Deviation 0.884796396144373
|
||||
|
||||
R-Squared 0.999993745883712
|
||||
|
||||
|
||||
Certified Analysis of Variance Table
|
||||
|
||||
Source of Degrees of Sums of Mean
|
||||
Variation Freedom Squares Squares F Statistic
|
||||
|
||||
Regression 1 4255954.13232369 4255954.13232369 5436385.54079785
|
||||
Residual 34 26.6173985294224 0.782864662630069
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Data: y x
|
||||
0.1 0.2
|
||||
338.8 337.4
|
||||
118.1 118.2
|
||||
888.0 884.6
|
||||
9.2 10.1
|
||||
228.1 226.5
|
||||
668.5 666.3
|
||||
998.5 996.3
|
||||
449.1 448.6
|
||||
778.9 777.0
|
||||
559.2 558.2
|
||||
0.3 0.4
|
||||
0.1 0.6
|
||||
778.1 775.5
|
||||
668.8 666.9
|
||||
339.3 338.0
|
||||
448.9 447.5
|
||||
10.8 11.6
|
||||
557.7 556.0
|
||||
228.3 228.1
|
||||
998.0 995.8
|
||||
888.8 887.6
|
||||
119.6 120.2
|
||||
0.3 0.3
|
||||
0.6 0.3
|
||||
557.6 556.8
|
||||
339.3 339.1
|
||||
888.0 887.2
|
||||
998.5 999.0
|
||||
778.9 779.0
|
||||
10.2 11.1
|
||||
117.6 118.3
|
||||
228.9 229.2
|
||||
668.4 669.1
|
||||
449.2 448.9
|
||||
0.2 0.5
|
||||
|
||||
Binary file not shown.
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,568 @@
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
import pytest
|
||||
from pytest import raises as assert_raises
|
||||
from scipy.stats import (binned_statistic, binned_statistic_2d,
|
||||
binned_statistic_dd)
|
||||
from scipy._lib._util import check_random_state
|
||||
|
||||
from .common_tests import check_named_results
|
||||
|
||||
|
||||
class TestBinnedStatistic:
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
rng = check_random_state(9865)
|
||||
cls.x = rng.uniform(size=100)
|
||||
cls.y = rng.uniform(size=100)
|
||||
cls.v = rng.uniform(size=100)
|
||||
cls.X = rng.uniform(size=(100, 3))
|
||||
cls.w = rng.uniform(size=100)
|
||||
cls.u = rng.uniform(size=100) + 1e6
|
||||
|
||||
def test_1d_count(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
|
||||
count2, edges2 = np.histogram(x, bins=10)
|
||||
|
||||
assert_allclose(count1, count2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_gh5927(self):
|
||||
# smoke test for gh5927 - binned_statistic was using `is` for string
|
||||
# comparison
|
||||
x = self.x
|
||||
v = self.v
|
||||
statistics = ['mean', 'median', 'count', 'sum']
|
||||
for statistic in statistics:
|
||||
binned_statistic(x, v, statistic, bins=10)
|
||||
|
||||
def test_big_number_std(self):
|
||||
# tests for numerical stability of std calculation
|
||||
# see issue gh-10126 for more
|
||||
x = self.x
|
||||
u = self.u
|
||||
stat1, edges1, bc = binned_statistic(x, u, 'std', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, u, np.std, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
|
||||
def test_empty_bins_std(self):
|
||||
# tests that std returns gives nan for empty bins
|
||||
x = self.x
|
||||
u = self.u
|
||||
print(binned_statistic(x, u, 'count', bins=1000))
|
||||
stat1, edges1, bc = binned_statistic(x, u, 'std', bins=1000)
|
||||
stat2, edges2, bc = binned_statistic(x, u, np.std, bins=1000)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
|
||||
def test_non_finite_inputs_and_int_bins(self):
|
||||
# if either `values` or `sample` contain np.inf or np.nan throw
|
||||
# see issue gh-9010 for more
|
||||
x = self.x
|
||||
u = self.u
|
||||
orig = u[0]
|
||||
u[0] = np.inf
|
||||
assert_raises(ValueError, binned_statistic, u, x, 'std', bins=10)
|
||||
# need to test for non-python specific ints, e.g. np.int8, np.int64
|
||||
assert_raises(ValueError, binned_statistic, u, x, 'std',
|
||||
bins=np.int64(10))
|
||||
u[0] = np.nan
|
||||
assert_raises(ValueError, binned_statistic, u, x, 'count', bins=10)
|
||||
# replace original value, u belongs the class
|
||||
u[0] = orig
|
||||
|
||||
def test_1d_result_attributes(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
res = binned_statistic(x, v, 'count', bins=10)
|
||||
attributes = ('statistic', 'bin_edges', 'binnumber')
|
||||
check_named_results(res, attributes)
|
||||
|
||||
def test_1d_sum(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
|
||||
sum2, edges2 = np.histogram(x, bins=10, weights=v)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_mean(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_std(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_min(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_max(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_median(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
|
||||
stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_1d_bincode(self):
|
||||
x = self.x[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
|
||||
bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
|
||||
1, 2, 1])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_allclose(bc, bc2)
|
||||
assert_allclose(bcount, count1)
|
||||
|
||||
def test_1d_range_keyword(self):
|
||||
# Regression test for gh-3063, range can be (min, max) or [(min, max)]
|
||||
np.random.seed(9865)
|
||||
x = np.arange(30)
|
||||
data = np.random.random(30)
|
||||
|
||||
mean, bins, _ = binned_statistic(x[:15], data[:15])
|
||||
mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
|
||||
mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
|
||||
|
||||
assert_allclose(mean, mean_range)
|
||||
assert_allclose(bins, bins_range)
|
||||
assert_allclose(mean, mean_range2)
|
||||
assert_allclose(bins, bins_range2)
|
||||
|
||||
def test_1d_multi_values(self):
|
||||
x = self.x
|
||||
v = self.v
|
||||
w = self.w
|
||||
|
||||
stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
|
||||
stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
|
||||
stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)
|
||||
|
||||
assert_allclose(stat2[0], stat1v)
|
||||
assert_allclose(stat2[1], stat1w)
|
||||
assert_allclose(edges1v, edges2)
|
||||
assert_allclose(bc1v, bc2)
|
||||
|
||||
def test_2d_count(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
count1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'count', bins=5)
|
||||
count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
|
||||
|
||||
assert_allclose(count1, count2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_result_attributes(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
res = binned_statistic_2d(x, y, v, 'count', bins=5)
|
||||
attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
|
||||
check_named_results(res, attributes)
|
||||
|
||||
def test_2d_sum(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
|
||||
sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_mean(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_mean_unicode(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'mean', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_std(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_min(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_max(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_median(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'median', bins=5)
|
||||
stat2, binx2, biny2, bc = binned_statistic_2d(
|
||||
x, y, v, np.median, bins=5)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(binx1, binx2)
|
||||
assert_allclose(biny1, biny2)
|
||||
|
||||
def test_2d_bincode(self):
|
||||
x = self.x[:20]
|
||||
y = self.y[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, binx1, biny1, bc = binned_statistic_2d(
|
||||
x, y, v, 'count', bins=3)
|
||||
bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
|
||||
6, 11, 16, 6, 6, 11, 8])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_allclose(bc, bc2)
|
||||
count1adj = count1[count1.nonzero()]
|
||||
assert_allclose(bcount, count1adj)
|
||||
|
||||
def test_2d_multi_values(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
w = self.w
|
||||
|
||||
stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
|
||||
x, y, v, 'mean', bins=8)
|
||||
stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
|
||||
x, y, w, 'mean', bins=8)
|
||||
stat2, binx2, biny2, bc2 = binned_statistic_2d(
|
||||
x, y, [v, w], 'mean', bins=8)
|
||||
|
||||
assert_allclose(stat2[0], stat1v)
|
||||
assert_allclose(stat2[1], stat1w)
|
||||
assert_allclose(binx1v, binx2)
|
||||
assert_allclose(biny1w, biny2)
|
||||
assert_allclose(bc1v, bc2)
|
||||
|
||||
def test_2d_binnumbers_unraveled(self):
|
||||
x = self.x
|
||||
y = self.y
|
||||
v = self.v
|
||||
|
||||
stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
|
||||
stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)
|
||||
|
||||
stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
|
||||
x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)
|
||||
|
||||
bcx3 = np.searchsorted(edgesx, x, side='right')
|
||||
bcy3 = np.searchsorted(edgesy, y, side='right')
|
||||
|
||||
# `numpy.searchsorted` is non-inclusive on right-edge, compensate
|
||||
bcx3[x == x.max()] -= 1
|
||||
bcy3[y == y.max()] -= 1
|
||||
|
||||
assert_allclose(bcx, bc2[0])
|
||||
assert_allclose(bcy, bc2[1])
|
||||
assert_allclose(bcx3, bc2[0])
|
||||
assert_allclose(bcy3, bc2[1])
|
||||
|
||||
def test_dd_count(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
count2, edges2 = np.histogramdd(X, bins=3)
|
||||
|
||||
assert_allclose(count1, count2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_result_attributes(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
res = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
attributes = ('statistic', 'bin_edges', 'binnumber')
|
||||
check_named_results(res, attributes)
|
||||
|
||||
def test_dd_sum(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
|
||||
sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
|
||||
sum3, edges3, bc = binned_statistic_dd(X, v, np.sum, bins=3)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(edges1, edges2)
|
||||
assert_allclose(sum1, sum3)
|
||||
assert_allclose(edges1, edges3)
|
||||
|
||||
def test_dd_mean(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_std(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_min(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_max(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_median(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
|
||||
stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
|
||||
|
||||
assert_allclose(stat1, stat2)
|
||||
assert_allclose(edges1, edges2)
|
||||
|
||||
def test_dd_bincode(self):
|
||||
X = self.X[:20]
|
||||
v = self.v[:20]
|
||||
|
||||
count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
|
||||
bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
|
||||
32, 36, 91, 43, 87, 81, 81])
|
||||
|
||||
bcount = [(bc == i).sum() for i in np.unique(bc)]
|
||||
|
||||
assert_allclose(bc, bc2)
|
||||
count1adj = count1[count1.nonzero()]
|
||||
assert_allclose(bcount, count1adj)
|
||||
|
||||
def test_dd_multi_values(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
w = self.w
|
||||
|
||||
for stat in ["count", "sum", "mean", "std", "min", "max", "median",
|
||||
np.std]:
|
||||
stat1v, edges1v, bc1v = binned_statistic_dd(X, v, stat, bins=8)
|
||||
stat1w, edges1w, bc1w = binned_statistic_dd(X, w, stat, bins=8)
|
||||
stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], stat, bins=8)
|
||||
assert_allclose(stat2[0], stat1v)
|
||||
assert_allclose(stat2[1], stat1w)
|
||||
assert_allclose(edges1v, edges2)
|
||||
assert_allclose(edges1w, edges2)
|
||||
assert_allclose(bc1v, bc2)
|
||||
|
||||
def test_dd_binnumbers_unraveled(self):
|
||||
X = self.X
|
||||
v = self.v
|
||||
|
||||
stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
|
||||
stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
|
||||
stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)
|
||||
|
||||
stat2, edges2, bc2 = binned_statistic_dd(
|
||||
X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)
|
||||
|
||||
assert_allclose(bcx, bc2[0])
|
||||
assert_allclose(bcy, bc2[1])
|
||||
assert_allclose(bcz, bc2[2])
|
||||
|
||||
def test_dd_binned_statistic_result(self):
|
||||
# NOTE: tests the reuse of bin_edges from previous call
|
||||
x = np.random.random((10000, 3))
|
||||
v = np.random.random(10000)
|
||||
bins = np.linspace(0, 1, 10)
|
||||
bins = (bins, bins, bins)
|
||||
|
||||
result = binned_statistic_dd(x, v, 'mean', bins=bins)
|
||||
stat = result.statistic
|
||||
|
||||
result = binned_statistic_dd(x, v, 'mean',
|
||||
binned_statistic_result=result)
|
||||
stat2 = result.statistic
|
||||
|
||||
assert_allclose(stat, stat2)
|
||||
|
||||
def test_dd_zero_dedges(self):
|
||||
x = np.random.random((10000, 3))
|
||||
v = np.random.random(10000)
|
||||
bins = np.linspace(0, 1, 10)
|
||||
bins = np.append(bins, 1)
|
||||
bins = (bins, bins, bins)
|
||||
with assert_raises(ValueError, match='difference is numerically 0'):
|
||||
binned_statistic_dd(x, v, 'mean', bins=bins)
|
||||
|
||||
def test_dd_range_errors(self):
|
||||
# Test that descriptive exceptions are raised as appropriate for bad
|
||||
# values of the `range` argument. (See gh-12996)
|
||||
with assert_raises(ValueError,
|
||||
match='In range, start must be <= stop'):
|
||||
binned_statistic_dd([self.y], self.v,
|
||||
range=[[1, 0]])
|
||||
with assert_raises(
|
||||
ValueError,
|
||||
match='In dimension 1 of range, start must be <= stop'):
|
||||
binned_statistic_dd([self.x, self.y], self.v,
|
||||
range=[[1, 0], [0, 1]])
|
||||
with assert_raises(
|
||||
ValueError,
|
||||
match='In dimension 2 of range, start must be <= stop'):
|
||||
binned_statistic_dd([self.x, self.y], self.v,
|
||||
range=[[0, 1], [1, 0]])
|
||||
with assert_raises(
|
||||
ValueError,
|
||||
match='range given for 1 dimensions; 2 required'):
|
||||
binned_statistic_dd([self.x, self.y], self.v,
|
||||
range=[[0, 1]])
|
||||
|
||||
def test_binned_statistic_float32(self):
|
||||
X = np.array([0, 0.42358226], dtype=np.float32)
|
||||
stat, _, _ = binned_statistic(X, None, 'count', bins=5)
|
||||
assert_allclose(stat, np.array([1, 0, 0, 0, 1], dtype=np.float64))
|
||||
|
||||
def test_gh14332(self):
|
||||
# Test the wrong output when the `sample` is close to bin edge
|
||||
x = []
|
||||
size = 20
|
||||
for i in range(size):
|
||||
x += [1-0.1**i]
|
||||
|
||||
bins = np.linspace(0,1,11)
|
||||
sum1, edges1, bc = binned_statistic_dd(x, np.ones(len(x)),
|
||||
bins=[bins], statistic='sum')
|
||||
sum2, edges2 = np.histogram(x, bins=bins)
|
||||
|
||||
assert_allclose(sum1, sum2)
|
||||
assert_allclose(edges1[0], edges2)
|
||||
|
||||
@pytest.mark.parametrize("dtype", [np.float64, np.complex128])
|
||||
@pytest.mark.parametrize("statistic", [np.mean, np.median, np.sum, np.std,
|
||||
np.min, np.max, 'count',
|
||||
lambda x: (x**2).sum(),
|
||||
lambda x: (x**2).sum() * 1j])
|
||||
def test_dd_all(self, dtype, statistic):
|
||||
def ref_statistic(x):
|
||||
return len(x) if statistic == 'count' else statistic(x)
|
||||
|
||||
rng = np.random.default_rng(3704743126639371)
|
||||
n = 10
|
||||
x = rng.random(size=n)
|
||||
i = x >= 0.5
|
||||
v = rng.random(size=n)
|
||||
if dtype is np.complex128:
|
||||
v = v + rng.random(size=n)*1j
|
||||
|
||||
stat, _, _ = binned_statistic_dd(x, v, statistic, bins=2)
|
||||
ref = np.array([ref_statistic(v[~i]), ref_statistic(v[i])])
|
||||
assert_allclose(stat, ref)
|
||||
assert stat.dtype == np.result_type(ref.dtype, np.float64)
|
||||
@ -0,0 +1,152 @@
|
||||
# Tests for the CensoredData class.
|
||||
|
||||
import pytest
|
||||
import numpy as np
|
||||
from numpy.testing import assert_equal, assert_array_equal
|
||||
from scipy.stats import CensoredData
|
||||
|
||||
|
||||
class TestCensoredData:
|
||||
|
||||
def test_basic(self):
|
||||
uncensored = [1]
|
||||
left = [0]
|
||||
right = [2, 5]
|
||||
interval = [[2, 3]]
|
||||
data = CensoredData(uncensored, left=left, right=right,
|
||||
interval=interval)
|
||||
assert_equal(data._uncensored, uncensored)
|
||||
assert_equal(data._left, left)
|
||||
assert_equal(data._right, right)
|
||||
assert_equal(data._interval, interval)
|
||||
|
||||
udata = data._uncensor()
|
||||
assert_equal(udata, np.concatenate((uncensored, left, right,
|
||||
np.mean(interval, axis=1))))
|
||||
|
||||
def test_right_censored(self):
|
||||
x = np.array([0, 3, 2.5])
|
||||
is_censored = np.array([0, 1, 0], dtype=bool)
|
||||
data = CensoredData.right_censored(x, is_censored)
|
||||
assert_equal(data._uncensored, x[~is_censored])
|
||||
assert_equal(data._right, x[is_censored])
|
||||
assert_equal(data._left, [])
|
||||
assert_equal(data._interval, np.empty((0, 2)))
|
||||
|
||||
def test_left_censored(self):
|
||||
x = np.array([0, 3, 2.5])
|
||||
is_censored = np.array([0, 1, 0], dtype=bool)
|
||||
data = CensoredData.left_censored(x, is_censored)
|
||||
assert_equal(data._uncensored, x[~is_censored])
|
||||
assert_equal(data._left, x[is_censored])
|
||||
assert_equal(data._right, [])
|
||||
assert_equal(data._interval, np.empty((0, 2)))
|
||||
|
||||
def test_interval_censored_basic(self):
|
||||
a = [0.5, 2.0, 3.0, 5.5]
|
||||
b = [1.0, 2.5, 3.5, 7.0]
|
||||
data = CensoredData.interval_censored(low=a, high=b)
|
||||
assert_array_equal(data._interval, np.array(list(zip(a, b))))
|
||||
assert data._uncensored.shape == (0,)
|
||||
assert data._left.shape == (0,)
|
||||
assert data._right.shape == (0,)
|
||||
|
||||
def test_interval_censored_mixed(self):
|
||||
# This is actually a mix of uncensored, left-censored, right-censored
|
||||
# and interval-censored data. Check that when the `interval_censored`
|
||||
# class method is used, the data is correctly separated into the
|
||||
# appropriate arrays.
|
||||
a = [0.5, -np.inf, -13.0, 2.0, 1.0, 10.0, -1.0]
|
||||
b = [0.5, 2500.0, np.inf, 3.0, 1.0, 11.0, np.inf]
|
||||
data = CensoredData.interval_censored(low=a, high=b)
|
||||
assert_array_equal(data._interval, [[2.0, 3.0], [10.0, 11.0]])
|
||||
assert_array_equal(data._uncensored, [0.5, 1.0])
|
||||
assert_array_equal(data._left, [2500.0])
|
||||
assert_array_equal(data._right, [-13.0, -1.0])
|
||||
|
||||
def test_interval_to_other_types(self):
|
||||
# The interval parameter can represent uncensored and
|
||||
# left- or right-censored data. Test the conversion of such
|
||||
# an example to the canonical form in which the different
|
||||
# types have been split into the separate arrays.
|
||||
interval = np.array([[0, 1], # interval-censored
|
||||
[2, 2], # not censored
|
||||
[3, 3], # not censored
|
||||
[9, np.inf], # right-censored
|
||||
[8, np.inf], # right-censored
|
||||
[-np.inf, 0], # left-censored
|
||||
[1, 2]]) # interval-censored
|
||||
data = CensoredData(interval=interval)
|
||||
assert_equal(data._uncensored, [2, 3])
|
||||
assert_equal(data._left, [0])
|
||||
assert_equal(data._right, [9, 8])
|
||||
assert_equal(data._interval, [[0, 1], [1, 2]])
|
||||
|
||||
def test_empty_arrays(self):
|
||||
data = CensoredData(uncensored=[], left=[], right=[], interval=[])
|
||||
assert data._uncensored.shape == (0,)
|
||||
assert data._left.shape == (0,)
|
||||
assert data._right.shape == (0,)
|
||||
assert data._interval.shape == (0, 2)
|
||||
assert len(data) == 0
|
||||
|
||||
def test_invalid_constructor_args(self):
|
||||
with pytest.raises(ValueError, match='must be a one-dimensional'):
|
||||
CensoredData(uncensored=[[1, 2, 3]])
|
||||
with pytest.raises(ValueError, match='must be a one-dimensional'):
|
||||
CensoredData(left=[[1, 2, 3]])
|
||||
with pytest.raises(ValueError, match='must be a one-dimensional'):
|
||||
CensoredData(right=[[1, 2, 3]])
|
||||
with pytest.raises(ValueError, match='must be a two-dimensional'):
|
||||
CensoredData(interval=[[1, 2, 3]])
|
||||
|
||||
with pytest.raises(ValueError, match='must not contain nan'):
|
||||
CensoredData(uncensored=[1, np.nan, 2])
|
||||
with pytest.raises(ValueError, match='must not contain nan'):
|
||||
CensoredData(left=[1, np.nan, 2])
|
||||
with pytest.raises(ValueError, match='must not contain nan'):
|
||||
CensoredData(right=[1, np.nan, 2])
|
||||
with pytest.raises(ValueError, match='must not contain nan'):
|
||||
CensoredData(interval=[[1, np.nan], [2, 3]])
|
||||
|
||||
with pytest.raises(ValueError,
|
||||
match='both values must not be infinite'):
|
||||
CensoredData(interval=[[1, 3], [2, 9], [np.inf, np.inf]])
|
||||
|
||||
with pytest.raises(ValueError,
|
||||
match='left value must not exceed the right'):
|
||||
CensoredData(interval=[[1, 0], [2, 2]])
|
||||
|
||||
@pytest.mark.parametrize('func', [CensoredData.left_censored,
|
||||
CensoredData.right_censored])
|
||||
def test_invalid_left_right_censored_args(self, func):
|
||||
with pytest.raises(ValueError,
|
||||
match='`x` must be one-dimensional'):
|
||||
func([[1, 2, 3]], [0, 1, 1])
|
||||
with pytest.raises(ValueError,
|
||||
match='`censored` must be one-dimensional'):
|
||||
func([1, 2, 3], [[0, 1, 1]])
|
||||
with pytest.raises(ValueError, match='`x` must not contain'):
|
||||
func([1, 2, np.nan], [0, 1, 1])
|
||||
with pytest.raises(ValueError, match='must have the same length'):
|
||||
func([1, 2, 3], [0, 0, 1, 1])
|
||||
|
||||
def test_invalid_censored_args(self):
|
||||
with pytest.raises(ValueError,
|
||||
match='`low` must be a one-dimensional'):
|
||||
CensoredData.interval_censored(low=[[3]], high=[4, 5])
|
||||
with pytest.raises(ValueError,
|
||||
match='`high` must be a one-dimensional'):
|
||||
CensoredData.interval_censored(low=[3], high=[[4, 5]])
|
||||
with pytest.raises(ValueError, match='`low` must not contain'):
|
||||
CensoredData.interval_censored([1, 2, np.nan], [0, 1, 1])
|
||||
with pytest.raises(ValueError, match='must have the same length'):
|
||||
CensoredData.interval_censored([1, 2, 3], [0, 0, 1, 1])
|
||||
|
||||
def test_count_censored(self):
|
||||
x = [1, 2, 3]
|
||||
# data1 has no censored data.
|
||||
data1 = CensoredData(x)
|
||||
assert data1.num_censored() == 0
|
||||
data2 = CensoredData(uncensored=[2.5], left=[10], interval=[[0, 1]])
|
||||
assert data2.num_censored() == 2
|
||||
@ -0,0 +1,241 @@
|
||||
import numpy as np
|
||||
from numpy.testing import (assert_equal, assert_array_equal,
|
||||
assert_array_almost_equal, assert_approx_equal,
|
||||
assert_allclose)
|
||||
import pytest
|
||||
from pytest import raises as assert_raises
|
||||
from scipy.special import xlogy
|
||||
from scipy.stats.contingency import (margins, expected_freq,
|
||||
chi2_contingency, association)
|
||||
|
||||
|
||||
def test_margins():
    # 1-D: a single marginal, equal to the array itself.
    marg = margins(np.array([1]))
    assert_equal(len(marg), 1)
    assert_array_equal(marg[0], np.array([1]))

    # 2-D trivial case: both marginals keep their axes.
    rows, cols = margins(np.array([[1]]))
    assert_array_equal(rows, np.array([[1]]))
    assert_array_equal(cols, np.array([[1]]))

    # 2-D: row sums as a column vector, column sums as a row vector.
    rows, cols = margins(np.arange(12).reshape(2, 6))
    assert_array_equal(rows, np.array([[15], [51]]))
    assert_array_equal(cols, np.array([[6, 8, 10, 12, 14, 16]]))

    # 3-D: one marginal per axis, each keeping full dimensionality.
    m0, m1, m2 = margins(np.arange(24).reshape(2, 3, 4))
    assert_array_equal(m0, np.array([[[66]], [[210]]]))
    assert_array_equal(m1, np.array([[[60], [92], [124]]]))
    assert_array_equal(m2, np.array([[[60, 66, 72, 78]]]))
|
||||
|
||||
|
||||
def test_expected_freq():
    # Single-entry input.
    assert_array_equal(expected_freq([1]), np.array([1.0]))

    # Uniform-marginal 3-D table: every expected frequency is 1.
    table = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
    assert_array_equal(expected_freq(table), np.ones_like(table))

    # 2-D table with a hand-computed reference.
    table = np.array([[10, 10, 20], [20, 20, 20]])
    reference = np.array([[12., 12., 16.], [18., 18., 24.]])
    assert_array_almost_equal(expected_freq(table), reference)
|
||||
|
||||
|
||||
def test_chi2_contingency_trivial():
    # Proportional rows: statistic 0, p-value 1, one degree of freedom,
    # and the expected table equals the observed one.
    table = np.array([[1, 2], [1, 2]])
    chi2, p, dof, expected = chi2_contingency(table, correction=False)
    assert_equal(chi2, 0.0)
    assert_equal(p, 1.0)
    assert_equal(dof, 1)
    assert_array_equal(table, expected)

    # 1-D input is degenerate: zero degrees of freedom.
    table = np.array([1, 2, 3])
    chi2, p, dof, expected = chi2_contingency(table, correction=False)
    assert_equal(chi2, 0.0)
    assert_equal(p, 1.0)
    assert_equal(dof, 0)
    assert_array_equal(table, expected)
|
||||
|
||||
|
||||
def test_chi2_contingency_R():
    # Reference values computed independently in R with
    # summary(xtabs(data ~ factors)).

    # 3-way (2x4x3) table; R reports:
    #   Chisq = 102.17, df = 17, p-value = 3.514e-14
    table3 = np.array(
        [[[12, 34, 23],
          [35, 31, 11],
          [12, 32, 9],
          [12, 12, 14]],
         [[4, 47, 11],
          [34, 10, 18],
          [18, 13, 19],
          [9, 33, 25]]])
    chi2, p, dof, _ = chi2_contingency(table3)
    assert_approx_equal(chi2, 102.17, significant=5)
    assert_approx_equal(p, 3.514e-14, significant=4)
    assert_equal(dof, 17)

    # 4-way (2x2x2x2) table; R reports:
    #   Chisq = 8.758, df = 11, p-value = 0.6442
    table4 = np.array(
        [[[[12, 17],
           [11, 16]],
          [[11, 12],
           [15, 16]]],
         [[[23, 15],
           [30, 22]],
          [[14, 17],
           [15, 16]]]])
    chi2, p, dof, _ = chi2_contingency(table4)
    assert_approx_equal(chi2, 8.758, significant=4)
    assert_approx_equal(p, 0.6442, significant=4)
    assert_equal(dof, 11)
|
||||
|
||||
|
||||
def test_chi2_contingency_g():
    # G-test (log-likelihood ratio) without Yates correction.
    table = np.array([[15, 60], [15, 90]])
    g, p, dof, e = chi2_contingency(table, lambda_='log-likelihood',
                                    correction=False)
    assert_allclose(g, 2 * xlogy(table, table / e).sum())

    # With the correction, each cell is shifted 0.5 toward its
    # expected value before computing the statistic.
    g, p, dof, e = chi2_contingency(table, lambda_='log-likelihood',
                                    correction=True)
    shifted = table + np.array([[-0.5, 0.5], [0.5, -0.5]])
    assert_allclose(g, 2 * xlogy(shifted, shifted / e).sum())

    # 2x3 table: Yates correction does not apply, plain G statistic.
    table = np.array([[10, 12, 10], [12, 10, 10]])
    g, p, dof, e = chi2_contingency(table, lambda_='log-likelihood')
    assert_allclose(g, 2 * xlogy(table, table / e).sum())
|
||||
|
||||
|
||||
def test_chi2_contingency_bad_args():
    # Each of these inputs must raise a ValueError.
    bad_tables = [
        # Negative observed frequency.
        np.array([[-1, 10], [1, 2]]),
        # Zeros that make some expected frequencies zero.
        np.array([[0, 1], [0, 1]]),
        # Degenerate case: `observed` has size 0.
        np.empty((0, 8)),
    ]
    for table in bad_tables:
        assert_raises(ValueError, chi2_contingency, table)
|
||||
|
||||
|
||||
def test_chi2_contingency_yates_gh13875():
    # gh-13875: the magnitude of Yates' continuity correction must not
    # exceed |observed - expected|, so this p-value cannot overshoot 1.
    table = np.array([[1573, 3], [4, 0]])
    pvalue = chi2_contingency(table)[1]
    assert_allclose(pvalue, 1, rtol=1e-12)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("correction", [False, True])
def test_result(correction):
    # The result object must behave as both a named-attribute bunch
    # and a plain (statistic, pvalue, dof, expected_freq) tuple.
    table = np.array([[1, 2], [1, 2]])
    res = chi2_contingency(table, correction=correction)
    assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq),
                 res)
|
||||
|
||||
|
||||
def test_bad_association_args():
    # Unknown test-statistic name.
    assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
    # Input must be two-dimensional.
    assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
    # Error propagated from chi2_contingency (negative frequency).
    assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
    # Non-numeric entries are rejected.
    assert_raises(ValueError, association,
                  np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')
|
||||
|
||||
|
||||
@pytest.mark.parametrize('stat, expected',
                         [('cramer', 0.09222412010290792),
                          ('tschuprow', 0.0775509319944633),
                          ('pearson', 0.12932925727138758)])
def test_assoc(stat, expected):
    # Reference values for each association measure on a fixed
    # 3x5 contingency table.
    table = np.array([[12, 13, 14, 15, 16],
                      [17, 16, 18, 19, 11],
                      [9, 15, 14, 12, 11]])
    assert_allclose(association(observed=table, method=stat), expected)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user