venv/lib/python3.12/site-packages/scipy/cluster/__init__.py (new file, 31 lines)
@@ -0,0 +1,31 @@
"""
=========================================
Clustering package (:mod:`scipy.cluster`)
=========================================

.. currentmodule:: scipy.cluster

.. toctree::
   :hidden:

   cluster.vq
   cluster.hierarchy

Clustering algorithms are useful in information theory, target detection,
communications, compression, and other areas. The `vq` module only
supports vector quantization and the k-means algorithms.

The `hierarchy` module provides functions for hierarchical and
agglomerative clustering. Its features include generating hierarchical
clusters from distance matrices,
calculating statistics on clusters, cutting linkages
to generate flat clusters, and visualizing clusters with dendrograms.

"""
__all__ = ['vq', 'hierarchy']

from . import vq, hierarchy

from scipy._lib._testutils import PytestTester
test = PytestTester(__name__)
del PytestTester
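As a quick orientation to the package the docstring above describes, here is a minimal sketch (not part of this commit) that exercises both submodules on toy data; the blob positions, k=2, and t=2 below are illustrative assumptions:

import numpy as np
from scipy.cluster.vq import whiten, kmeans
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0, 1, (20, 3)),    # blob near the origin
                 rng.normal(8, 1, (20, 3))])   # blob near (8, 8, 8)

# vector-quantization route: whiten the features, then k-means with k=2
codebook, distortion = kmeans(whiten(pts), 2)

# hierarchical route: build a single-linkage tree, cut into 2 flat clusters
Z = linkage(pts, method='single')
labels = fcluster(Z, t=2, criterion='maxclust')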
3 binary files not shown.

venv/lib/python3.12/site-packages/scipy/cluster/hierarchy.py (new file, 4173 lines)
File diff suppressed because it is too large.

@@ -0,0 +1,145 @@
from numpy import array


Q_X = array([[5.26563660e-01, 3.14160190e-01, 8.00656370e-02],
             [7.50205180e-01, 4.60299830e-01, 8.98696460e-01],
             [6.65461230e-01, 6.94011420e-01, 9.10465700e-01],
             [9.64047590e-01, 1.43082200e-03, 7.39874220e-01],
             [1.08159060e-01, 5.53028790e-01, 6.63804780e-02],
             [9.31359130e-01, 8.25424910e-01, 9.52315440e-01],
             [6.78086960e-01, 3.41903970e-01, 5.61481950e-01],
             [9.82730940e-01, 7.04605210e-01, 8.70978630e-02],
             [6.14691610e-01, 4.69989230e-02, 6.02406450e-01],
             [5.80161260e-01, 9.17354970e-01, 5.88163850e-01],
             [1.38246310e+00, 1.96358160e+00, 1.94437880e+00],
             [2.10675860e+00, 1.67148730e+00, 1.34854480e+00],
             [1.39880070e+00, 1.66142050e+00, 1.32224550e+00],
             [1.71410460e+00, 1.49176380e+00, 1.45432170e+00],
             [1.54102340e+00, 1.84374950e+00, 1.64658950e+00],
             [2.08512480e+00, 1.84524350e+00, 2.17340850e+00],
             [1.30748740e+00, 1.53801650e+00, 2.16007740e+00],
             [1.41447700e+00, 1.99329070e+00, 1.99107420e+00],
             [1.61943490e+00, 1.47703280e+00, 1.89788160e+00],
             [1.59880600e+00, 1.54988980e+00, 1.57563350e+00],
             [3.37247380e+00, 2.69635310e+00, 3.39981700e+00],
             [3.13705120e+00, 3.36528090e+00, 3.06089070e+00],
             [3.29413250e+00, 3.19619500e+00, 2.90700170e+00],
             [2.65510510e+00, 3.06785900e+00, 2.97198540e+00],
             [3.30941040e+00, 2.59283970e+00, 2.57714110e+00],
             [2.59557220e+00, 3.33477370e+00, 3.08793190e+00],
             [2.58206180e+00, 3.41615670e+00, 3.26441990e+00],
             [2.71127000e+00, 2.77032450e+00, 2.63466500e+00],
             [2.79617850e+00, 3.25473720e+00, 3.41801560e+00],
             [2.64741750e+00, 2.54538040e+00, 3.25354110e+00]])

ytdist = array([662., 877., 255., 412., 996., 295., 468., 268., 400., 754.,
                564., 138., 219., 869., 669.])

linkage_ytdist_single = array([[2., 5., 138., 2.],
                               [3., 4., 219., 2.],
                               [0., 7., 255., 3.],
                               [1., 8., 268., 4.],
                               [6., 9., 295., 6.]])

linkage_ytdist_complete = array([[2., 5., 138., 2.],
                                 [3., 4., 219., 2.],
                                 [1., 6., 400., 3.],
                                 [0., 7., 412., 3.],
                                 [8., 9., 996., 6.]])

linkage_ytdist_average = array([[2., 5., 138., 2.],
                                [3., 4., 219., 2.],
                                [0., 7., 333.5, 3.],
                                [1., 6., 347.5, 3.],
                                [8., 9., 680.77777778, 6.]])

linkage_ytdist_weighted = array([[2., 5., 138., 2.],
                                 [3., 4., 219., 2.],
                                 [0., 7., 333.5, 3.],
                                 [1., 6., 347.5, 3.],
                                 [8., 9., 670.125, 6.]])

# the optimal leaf ordering of linkage_ytdist_single
linkage_ytdist_single_olo = array([[5., 2., 138., 2.],
                                   [4., 3., 219., 2.],
                                   [7., 0., 255., 3.],
                                   [1., 8., 268., 4.],
                                   [6., 9., 295., 6.]])

X = array([[1.43054825, -7.5693489],
           [6.95887839, 6.82293382],
           [2.87137846, -9.68248579],
           [7.87974764, -6.05485803],
           [8.24018364, -6.09495602],
           [7.39020262, 8.54004355]])

linkage_X_centroid = array([[3., 4., 0.36265956, 2.],
                            [1., 5., 1.77045373, 2.],
                            [0., 2., 2.55760419, 2.],
                            [6., 8., 6.43614494, 4.],
                            [7., 9., 15.17363237, 6.]])

linkage_X_median = array([[3., 4., 0.36265956, 2.],
                          [1., 5., 1.77045373, 2.],
                          [0., 2., 2.55760419, 2.],
                          [6., 8., 6.43614494, 4.],
                          [7., 9., 15.17363237, 6.]])

linkage_X_ward = array([[3., 4., 0.36265956, 2.],
                        [1., 5., 1.77045373, 2.],
                        [0., 2., 2.55760419, 2.],
                        [6., 8., 9.10208346, 4.],
                        [7., 9., 24.7784379, 6.]])

# the optimal leaf ordering of linkage_X_ward
linkage_X_ward_olo = array([[4., 3., 0.36265956, 2.],
                            [5., 1., 1.77045373, 2.],
                            [2., 0., 2.55760419, 2.],
                            [6., 8., 9.10208346, 4.],
                            [7., 9., 24.7784379, 6.]])

inconsistent_ytdist = {
    1: array([[138., 0., 1., 0.],
              [219., 0., 1., 0.],
              [255., 0., 1., 0.],
              [268., 0., 1., 0.],
              [295., 0., 1., 0.]]),
    2: array([[138., 0., 1., 0.],
              [219., 0., 1., 0.],
              [237., 25.45584412, 2., 0.70710678],
              [261.5, 9.19238816, 2., 0.70710678],
              [233.66666667, 83.9424406, 3., 0.7306594]]),
    3: array([[138., 0., 1., 0.],
              [219., 0., 1., 0.],
              [237., 25.45584412, 2., 0.70710678],
              [247.33333333, 25.38372182, 3., 0.81417007],
              [239., 69.36377537, 4., 0.80733783]]),
    4: array([[138., 0., 1., 0.],
              [219., 0., 1., 0.],
              [237., 25.45584412, 2., 0.70710678],
              [247.33333333, 25.38372182, 3., 0.81417007],
              [235., 60.73302232, 5., 0.98793042]])}

fcluster_inconsistent = {
    0.8: array([6, 2, 2, 4, 6, 2, 3, 7, 3, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1]),
    1.0: array([6, 2, 2, 4, 6, 2, 3, 7, 3, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1]),
    2.0: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1])}

fcluster_distance = {
    0.6: array([4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 3,
                1, 1, 1, 2, 1, 1, 1, 1, 1]),
    1.0: array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1]),
    2.0: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1])}

fcluster_maxclust = {
    8.0: array([5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 7, 7, 7, 7, 7, 8, 7, 7, 7, 7, 4,
                1, 1, 1, 3, 1, 1, 1, 1, 2]),
    4.0: array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2,
                1, 1, 1, 1, 1, 1, 1, 1, 1]),
    1.0: array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                1, 1, 1, 1, 1, 1, 1, 1, 1])}
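The arrays above are expected-output fixtures for the hierarchy tests (the consuming test module's diff is not shown in this view). A minimal sketch of how such a fixture would typically be checked, assuming the names defined above are in scope:

import numpy as np
from scipy.cluster.hierarchy import linkage

# recompute the single-linkage matrix from the condensed distances `ytdist`
# and compare it against the stored expectation
Z = linkage(ytdist, method='single')
np.testing.assert_allclose(Z, linkage_ytdist_single)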
@@ -0,0 +1,202 @@
import pytest
from pytest import raises as assert_raises
import numpy as np
from scipy.cluster.hierarchy import DisjointSet
import string


def generate_random_token():
    k = len(string.ascii_letters)
    tokens = list(np.arange(k, dtype=int))
    tokens += list(np.arange(k, dtype=float))
    tokens += list(string.ascii_letters)
    tokens += [None for i in range(k)]
    tokens = np.array(tokens, dtype=object)
    rng = np.random.RandomState(seed=0)

    while 1:
        size = rng.randint(1, 3)
        element = rng.choice(tokens, size)
        if size == 1:
            yield element[0]
        else:
            yield tuple(element)


def get_elements(n):
    # dict is deterministic without difficulty of comparing numpy ints
    elements = {}
    for element in generate_random_token():
        if element not in elements:
            elements[element] = len(elements)
            if len(elements) >= n:
                break
    return list(elements.keys())


def test_init():
    n = 10
    elements = get_elements(n)
    dis = DisjointSet(elements)
    assert dis.n_subsets == n
    assert list(dis) == elements


def test_len():
    n = 10
    elements = get_elements(n)
    dis = DisjointSet(elements)
    assert len(dis) == n

    dis.add("dummy")
    assert len(dis) == n + 1


@pytest.mark.parametrize("n", [10, 100])
def test_contains(n):
    elements = get_elements(n)
    dis = DisjointSet(elements)
    for x in elements:
        assert x in dis

    assert "dummy" not in dis


@pytest.mark.parametrize("n", [10, 100])
def test_add(n):
    elements = get_elements(n)
    dis1 = DisjointSet(elements)

    dis2 = DisjointSet()
    for i, x in enumerate(elements):
        dis2.add(x)
        assert len(dis2) == i + 1

        # test idempotency by adding element again
        dis2.add(x)
        assert len(dis2) == i + 1

    assert list(dis1) == list(dis2)


def test_element_not_present():
    elements = get_elements(n=10)
    dis = DisjointSet(elements)

    with assert_raises(KeyError):
        dis["dummy"]

    with assert_raises(KeyError):
        dis.merge(elements[0], "dummy")

    with assert_raises(KeyError):
        dis.connected(elements[0], "dummy")


@pytest.mark.parametrize("direction", ["forwards", "backwards"])
@pytest.mark.parametrize("n", [10, 100])
def test_linear_union_sequence(n, direction):
    elements = get_elements(n)
    dis = DisjointSet(elements)
    assert elements == list(dis)

    indices = list(range(n - 1))
    if direction == "backwards":
        indices = indices[::-1]

    for it, i in enumerate(indices):
        assert not dis.connected(elements[i], elements[i + 1])
        assert dis.merge(elements[i], elements[i + 1])
        assert dis.connected(elements[i], elements[i + 1])
        assert dis.n_subsets == n - 1 - it

    roots = [dis[i] for i in elements]
    if direction == "forwards":
        assert all(elements[0] == r for r in roots)
    else:
        assert all(elements[-2] == r for r in roots)
    assert not dis.merge(elements[0], elements[-1])


@pytest.mark.parametrize("n", [10, 100])
def test_self_unions(n):
    elements = get_elements(n)
    dis = DisjointSet(elements)

    for x in elements:
        assert dis.connected(x, x)
        assert not dis.merge(x, x)
        assert dis.connected(x, x)
        assert dis.n_subsets == len(elements)

    assert elements == list(dis)
    roots = [dis[x] for x in elements]
    assert elements == roots


@pytest.mark.parametrize("order", ["ab", "ba"])
@pytest.mark.parametrize("n", [10, 100])
def test_equal_size_ordering(n, order):
    elements = get_elements(n)
    dis = DisjointSet(elements)

    rng = np.random.RandomState(seed=0)
    indices = np.arange(n)
    rng.shuffle(indices)

    for i in range(0, len(indices), 2):
        a, b = elements[indices[i]], elements[indices[i + 1]]
        if order == "ab":
            assert dis.merge(a, b)
        else:
            assert dis.merge(b, a)

        expected = elements[min(indices[i], indices[i + 1])]
        assert dis[a] == expected
        assert dis[b] == expected


@pytest.mark.parametrize("kmax", [5, 10])
def test_binary_tree(kmax):
    n = 2**kmax
    elements = get_elements(n)
    dis = DisjointSet(elements)
    rng = np.random.RandomState(seed=0)

    for k in 2**np.arange(kmax):
        for i in range(0, n, 2 * k):
            r1, r2 = rng.randint(0, k, size=2)
            a, b = elements[i + r1], elements[i + k + r2]
            assert not dis.connected(a, b)
            assert dis.merge(a, b)
            assert dis.connected(a, b)

        assert elements == list(dis)
        roots = [dis[i] for i in elements]
        expected_indices = np.arange(n) - np.arange(n) % (2 * k)
        expected = [elements[i] for i in expected_indices]
        assert roots == expected


@pytest.mark.parametrize("n", [10, 100])
def test_subsets(n):
    elements = get_elements(n)
    dis = DisjointSet(elements)

    rng = np.random.RandomState(seed=0)
    for i, j in rng.randint(0, n, (n, 2)):
        x = elements[i]
        y = elements[j]

        expected = {element for element in dis if {dis[element]} == {dis[x]}}
        assert dis.subset_size(x) == len(dis.subset(x))
        assert expected == dis.subset(x)

        expected = {dis[element]: set() for element in dis}
        for element in dis:
            expected[dis[element]].add(element)
        expected = list(expected.values())
        assert expected == dis.subsets()

        dis.merge(x, y)
        assert dis.subset(x) == dis.subset(y)
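The suite above pins down the `DisjointSet` API from `scipy.cluster.hierarchy`. A minimal usage sketch of the operations it exercises (`merge`, `connected`, `subset`, `subsets`, `n_subsets`), with illustrative string elements:

from scipy.cluster.hierarchy import DisjointSet

ds = DisjointSet(['a', 'b', 'c', 'd'])
assert ds.merge('a', 'b')        # True: 'a' and 'b' were in different subsets
assert not ds.merge('a', 'b')    # False: already connected, nothing to do
assert ds.connected('a', 'b')
assert ds.n_subsets == 3
assert ds.subset('a') == {'a', 'b'}
assert ds.subsets() == [{'a', 'b'}, {'c'}, {'d'}]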
File diff suppressed because it is too large.

venv/lib/python3.12/site-packages/scipy/cluster/tests/test_vq.py (new file, 435 lines)
@@ -0,0 +1,435 @@
import warnings
import sys
from copy import deepcopy

import numpy as np
from numpy.testing import (
    assert_array_equal, assert_equal, assert_, suppress_warnings
)
import pytest
from pytest import raises as assert_raises

from scipy.cluster.vq import (kmeans, kmeans2, py_vq, vq, whiten,
                              ClusterError, _krandinit)
from scipy.cluster import _vq
from scipy.conftest import array_api_compatible
from scipy.sparse._sputils import matrix

from scipy._lib._array_api import (
    SCIPY_ARRAY_API, copy, cov, xp_assert_close, xp_assert_equal
)

pytestmark = [array_api_compatible, pytest.mark.usefixtures("skip_xp_backends")]
skip_xp_backends = pytest.mark.skip_xp_backends

TESTDATA_2D = np.array([
    -2.2, 1.17, -1.63, 1.69, -2.04, 4.38, -3.09, 0.95, -1.7, 4.79, -1.68, 0.68,
    -2.26, 3.34, -2.29, 2.55, -1.72, -0.72, -1.99, 2.34, -2.75, 3.43, -2.45,
    2.41, -4.26, 3.65, -1.57, 1.87, -1.96, 4.03, -3.01, 3.86, -2.53, 1.28,
    -4.0, 3.95, -1.62, 1.25, -3.42, 3.17, -1.17, 0.12, -3.03, -0.27, -2.07,
    -0.55, -1.17, 1.34, -2.82, 3.08, -2.44, 0.24, -1.71, 2.48, -5.23, 4.29,
    -2.08, 3.69, -1.89, 3.62, -2.09, 0.26, -0.92, 1.07, -2.25, 0.88, -2.25,
    2.02, -4.31, 3.86, -2.03, 3.42, -2.76, 0.3, -2.48, -0.29, -3.42, 3.21,
    -2.3, 1.73, -2.84, 0.69, -1.81, 2.48, -5.24, 4.52, -2.8, 1.31, -1.67,
    -2.34, -1.18, 2.17, -2.17, 2.82, -1.85, 2.25, -2.45, 1.86, -6.79, 3.94,
    -2.33, 1.89, -1.55, 2.08, -1.36, 0.93, -2.51, 2.74, -2.39, 3.92, -3.33,
    2.99, -2.06, -0.9, -2.83, 3.35, -2.59, 3.05, -2.36, 1.85, -1.69, 1.8,
    -1.39, 0.66, -2.06, 0.38, -1.47, 0.44, -4.68, 3.77, -5.58, 3.44, -2.29,
    2.24, -1.04, -0.38, -1.85, 4.23, -2.88, 0.73, -2.59, 1.39, -1.34, 1.75,
    -1.95, 1.3, -2.45, 3.09, -1.99, 3.41, -5.55, 5.21, -1.73, 2.52, -2.17,
    0.85, -2.06, 0.49, -2.54, 2.07, -2.03, 1.3, -3.23, 3.09, -1.55, 1.44,
    -0.81, 1.1, -2.99, 2.92, -1.59, 2.18, -2.45, -0.73, -3.12, -1.3, -2.83,
    0.2, -2.77, 3.24, -1.98, 1.6, -4.59, 3.39, -4.85, 3.75, -2.25, 1.71, -3.28,
    3.38, -1.74, 0.88, -2.41, 1.92, -2.24, 1.19, -2.48, 1.06, -1.68, -0.62,
    -1.3, 0.39, -1.78, 2.35, -3.54, 2.44, -1.32, 0.66, -2.38, 2.76, -2.35,
    3.95, -1.86, 4.32, -2.01, -1.23, -1.79, 2.76, -2.13, -0.13, -5.25, 3.84,
    -2.24, 1.59, -4.85, 2.96, -2.41, 0.01, -0.43, 0.13, -3.92, 2.91, -1.75,
    -0.53, -1.69, 1.69, -1.09, 0.15, -2.11, 2.17, -1.53, 1.22, -2.1, -0.86,
    -2.56, 2.28, -3.02, 3.33, -1.12, 3.86, -2.18, -1.19, -3.03, 0.79, -0.83,
    0.97, -3.19, 1.45, -1.34, 1.28, -2.52, 4.22, -4.53, 3.22, -1.97, 1.75,
    -2.36, 3.19, -0.83, 1.53, -1.59, 1.86, -2.17, 2.3, -1.63, 2.71, -2.03,
    3.75, -2.57, -0.6, -1.47, 1.33, -1.95, 0.7, -1.65, 1.27, -1.42, 1.09, -3.0,
    3.87, -2.51, 3.06, -2.6, 0.74, -1.08, -0.03, -2.44, 1.31, -2.65, 2.99,
    -1.84, 1.65, -4.76, 3.75, -2.07, 3.98, -2.4, 2.67, -2.21, 1.49, -1.21,
    1.22, -5.29, 2.38, -2.85, 2.28, -5.6, 3.78, -2.7, 0.8, -1.81, 3.5, -3.75,
    4.17, -1.29, 2.99, -5.92, 3.43, -1.83, 1.23, -1.24, -1.04, -2.56, 2.37,
    -3.26, 0.39, -4.63, 2.51, -4.52, 3.04, -1.7, 0.36, -1.41, 0.04, -2.1, 1.0,
    -1.87, 3.78, -4.32, 3.59, -2.24, 1.38, -1.99, -0.22, -1.87, 1.95, -0.84,
    2.17, -5.38, 3.56, -1.27, 2.9, -1.79, 3.31, -5.47, 3.85, -1.44, 3.69,
    -2.02, 0.37, -1.29, 0.33, -2.34, 2.56, -1.74, -1.27, -1.97, 1.22, -2.51,
    -0.16, -1.64, -0.96, -2.99, 1.4, -1.53, 3.31, -2.24, 0.45, -2.46, 1.71,
    -2.88, 1.56, -1.63, 1.46, -1.41, 0.68, -1.96, 2.76, -1.61,
    2.11]).reshape((200, 2))


# Global data
X = np.array([[3.0, 3], [4, 3], [4, 2],
              [9, 2], [5, 1], [6, 2], [9, 4],
              [5, 2], [5, 4], [7, 4], [6, 5]])

CODET1 = np.array([[3.0000, 3.0000],
                   [6.2000, 4.0000],
                   [5.8000, 1.8000]])

CODET2 = np.array([[11.0/3, 8.0/3],
                   [6.7500, 4.2500],
                   [6.2500, 1.7500]])

LABEL1 = np.array([0, 1, 2, 2, 2, 2, 1, 2, 1, 1, 1])


class TestWhiten:

    def test_whiten(self, xp):
        desired = xp.asarray([[5.08738849, 2.97091878],
                              [3.19909255, 0.69660580],
                              [4.51041982, 0.02640918],
                              [4.38567074, 0.95120889],
                              [2.32191480, 1.63195503]])

        obs = xp.asarray([[0.98744510, 0.82766775],
                          [0.62093317, 0.19406729],
                          [0.87545741, 0.00735733],
                          [0.85124403, 0.26499712],
                          [0.45067590, 0.45464607]])
        xp_assert_close(whiten(obs), desired, rtol=1e-5)

    @skip_xp_backends('jax.numpy',
                      reasons=['jax arrays do not support item assignment'])
    def test_whiten_zero_std(self, xp):
        desired = xp.asarray([[0., 1.0, 2.86666544],
                              [0., 1.0, 1.32460034],
                              [0., 1.0, 3.74382172]])

        obs = xp.asarray([[0., 1., 0.74109533],
                          [0., 1., 0.34243798],
                          [0., 1., 0.96785929]])
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter('always')

            xp_assert_close(whiten(obs), desired, rtol=1e-5)

            assert_equal(len(w), 1)
            assert_(issubclass(w[-1].category, RuntimeWarning))

    def test_whiten_not_finite(self, xp):
        for bad_value in xp.nan, xp.inf, -xp.inf:
            obs = xp.asarray([[0.98744510, bad_value],
                              [0.62093317, 0.19406729],
                              [0.87545741, 0.00735733],
                              [0.85124403, 0.26499712],
                              [0.45067590, 0.45464607]])
            assert_raises(ValueError, whiten, obs)

    @pytest.mark.skipif(SCIPY_ARRAY_API,
                        reason='`np.matrix` unsupported in array API mode')
    def test_whiten_not_finite_matrix(self, xp):
        for bad_value in np.nan, np.inf, -np.inf:
            obs = matrix([[0.98744510, bad_value],
                          [0.62093317, 0.19406729],
                          [0.87545741, 0.00735733],
                          [0.85124403, 0.26499712],
                          [0.45067590, 0.45464607]])
            assert_raises(ValueError, whiten, obs)


class TestVq:

    @skip_xp_backends(cpu_only=True)
    def test_py_vq(self, xp):
        initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
        # label1.dtype varies between int32 and int64 over platforms
        label1 = py_vq(xp.asarray(X), xp.asarray(initc))[0]
        xp_assert_equal(label1, xp.asarray(LABEL1, dtype=xp.int64),
                        check_dtype=False)

    @pytest.mark.skipif(SCIPY_ARRAY_API,
                        reason='`np.matrix` unsupported in array API mode')
    def test_py_vq_matrix(self, xp):
        initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
        # label1.dtype varies between int32 and int64 over platforms
        label1 = py_vq(matrix(X), matrix(initc))[0]
        assert_array_equal(label1, LABEL1)

    @skip_xp_backends(np_only=True, reasons=['`_vq` only supports NumPy backend'])
    def test_vq(self, xp):
        initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
        label1, _ = _vq.vq(xp.asarray(X), xp.asarray(initc))
        assert_array_equal(label1, LABEL1)
        _, _ = vq(xp.asarray(X), xp.asarray(initc))

    @pytest.mark.skipif(SCIPY_ARRAY_API,
                        reason='`np.matrix` unsupported in array API mode')
    def test_vq_matrix(self, xp):
        initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
        label1, _ = _vq.vq(matrix(X), matrix(initc))
        assert_array_equal(label1, LABEL1)
        _, _ = vq(matrix(X), matrix(initc))

    @skip_xp_backends(cpu_only=True)
    def test_vq_1d(self, xp):
        # Test special rank 1 vq algo, python implementation.
        data = X[:, 0]
        initc = data[:3]
        a, b = _vq.vq(data, initc)
        data = xp.asarray(data)
        initc = xp.asarray(initc)
        ta, tb = py_vq(data[:, np.newaxis], initc[:, np.newaxis])
        # ta.dtype varies between int32 and int64 over platforms
        xp_assert_equal(ta, xp.asarray(a, dtype=xp.int64), check_dtype=False)
        xp_assert_equal(tb, xp.asarray(b))

    @skip_xp_backends(np_only=True, reasons=['`_vq` only supports NumPy backend'])
    def test__vq_sametype(self, xp):
        a = xp.asarray([1.0, 2.0], dtype=xp.float64)
        b = a.astype(xp.float32)
        assert_raises(TypeError, _vq.vq, a, b)

    @skip_xp_backends(np_only=True, reasons=['`_vq` only supports NumPy backend'])
    def test__vq_invalid_type(self, xp):
        a = xp.asarray([1, 2], dtype=int)
        assert_raises(TypeError, _vq.vq, a, a)

    @skip_xp_backends(cpu_only=True)
    def test_vq_large_nfeat(self, xp):
        X = np.random.rand(20, 20)
        code_book = np.random.rand(3, 20)

        codes0, dis0 = _vq.vq(X, code_book)
        codes1, dis1 = py_vq(
            xp.asarray(X), xp.asarray(code_book)
        )
        xp_assert_close(dis1, xp.asarray(dis0), rtol=1e-5)
        # codes1.dtype varies between int32 and int64 over platforms
        xp_assert_equal(codes1, xp.asarray(codes0, dtype=xp.int64), check_dtype=False)

        X = X.astype(np.float32)
        code_book = code_book.astype(np.float32)

        codes0, dis0 = _vq.vq(X, code_book)
        codes1, dis1 = py_vq(
            xp.asarray(X), xp.asarray(code_book)
        )
        xp_assert_close(dis1, xp.asarray(dis0, dtype=xp.float64), rtol=1e-5)
        # codes1.dtype varies between int32 and int64 over platforms
        xp_assert_equal(codes1, xp.asarray(codes0, dtype=xp.int64), check_dtype=False)

    @skip_xp_backends(cpu_only=True)
    def test_vq_large_features(self, xp):
        X = np.random.rand(10, 5) * 1000000
        code_book = np.random.rand(2, 5) * 1000000

        codes0, dis0 = _vq.vq(X, code_book)
        codes1, dis1 = py_vq(
            xp.asarray(X), xp.asarray(code_book)
        )
        xp_assert_close(dis1, xp.asarray(dis0), rtol=1e-5)
        # codes1.dtype varies between int32 and int64 over platforms
        xp_assert_equal(codes1, xp.asarray(codes0, dtype=xp.int64), check_dtype=False)


# Whole class skipped on GPU for now;
# once pdist/cdist are hooked up for CuPy, more tests will work
@skip_xp_backends(cpu_only=True)
class TestKMean:

    def test_large_features(self, xp):
        # Generate a data set with large values and run kmeans on it
        # (regression test for gh-1077).
        d = 300
        n = 100

        m1 = np.random.randn(d)
        m2 = np.random.randn(d)
        x = 10000 * np.random.randn(n, d) - 20000 * m1
        y = 10000 * np.random.randn(n, d) + 20000 * m2

        data = np.empty((x.shape[0] + y.shape[0], d), np.float64)
        data[:x.shape[0]] = x
        data[x.shape[0]:] = y

        kmeans(xp.asarray(data), 2)

    def test_kmeans_simple(self, xp):
        np.random.seed(54321)
        initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
        code1 = kmeans(xp.asarray(X), xp.asarray(initc), iter=1)[0]
        xp_assert_close(code1, xp.asarray(CODET2))

    @pytest.mark.skipif(SCIPY_ARRAY_API,
                        reason='`np.matrix` unsupported in array API mode')
    def test_kmeans_simple_matrix(self, xp):
        np.random.seed(54321)
        initc = np.concatenate([[X[0]], [X[1]], [X[2]]])
        code1 = kmeans(matrix(X), matrix(initc), iter=1)[0]
        xp_assert_close(code1, CODET2)

    def test_kmeans_lost_cluster(self, xp):
        # This will cause kmeans to have a cluster with no points.
        data = xp.asarray(TESTDATA_2D)
        initk = xp.asarray([[-1.8127404, -0.67128041],
                            [2.04621601, 0.07401111],
                            [-2.31149087, -0.05160469]])

        kmeans(data, initk)
        with suppress_warnings() as sup:
            sup.filter(UserWarning,
                       "One of the clusters is empty. Re-run kmeans with a "
                       "different initialization")
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')

    def test_kmeans2_simple(self, xp):
        np.random.seed(12345678)
        initc = xp.asarray(np.concatenate([[X[0]], [X[1]], [X[2]]]))
        arrays = [xp.asarray] if SCIPY_ARRAY_API else [np.asarray, matrix]
        for tp in arrays:
            code1 = kmeans2(tp(X), tp(initc), iter=1)[0]
            code2 = kmeans2(tp(X), tp(initc), iter=2)[0]

            xp_assert_close(code1, xp.asarray(CODET1))
            xp_assert_close(code2, xp.asarray(CODET2))

    @pytest.mark.skipif(SCIPY_ARRAY_API,
                        reason='`np.matrix` unsupported in array API mode')
    def test_kmeans2_simple_matrix(self, xp):
        np.random.seed(12345678)
        initc = xp.asarray(np.concatenate([[X[0]], [X[1]], [X[2]]]))
        code1 = kmeans2(matrix(X), matrix(initc), iter=1)[0]
        code2 = kmeans2(matrix(X), matrix(initc), iter=2)[0]

        xp_assert_close(code1, CODET1)
        xp_assert_close(code2, CODET2)

    def test_kmeans2_rank1(self, xp):
        data = xp.asarray(TESTDATA_2D)
        data1 = data[:, 0]

        initc = data1[:3]
        code = copy(initc, xp=xp)
        kmeans2(data1, code, iter=1)[0]
        kmeans2(data1, code, iter=2)[0]

    def test_kmeans2_rank1_2(self, xp):
        data = xp.asarray(TESTDATA_2D)
        data1 = data[:, 0]
        kmeans2(data1, 2, iter=1)

    def test_kmeans2_high_dim(self, xp):
        # test kmeans2 when the number of dimensions exceeds the number
        # of input points
        data = xp.asarray(TESTDATA_2D)
        data = xp.reshape(data, (20, 20))[:10, :]
        kmeans2(data, 2)

    @skip_xp_backends('jax.numpy',
                      reasons=['jax arrays do not support item assignment'],
                      cpu_only=True)
    def test_kmeans2_init(self, xp):
        np.random.seed(12345)
        data = xp.asarray(TESTDATA_2D)
        k = 3

        kmeans2(data, k, minit='points')
        kmeans2(data[:, 1], k, minit='points')  # special case (1-D)

        kmeans2(data, k, minit='++')
        kmeans2(data[:, 1], k, minit='++')  # special case (1-D)

        # minit='random' can give warnings, filter those
        with suppress_warnings() as sup:
            sup.filter(message="One of the clusters is empty. Re-run.")
            kmeans2(data, k, minit='random')
            kmeans2(data[:, 1], k, minit='random')  # special case (1-D)

    @pytest.mark.skipif(sys.platform == 'win32',
                        reason='Fails with MemoryError in Wine.')
    def test_krandinit(self, xp):
        data = xp.asarray(TESTDATA_2D)
        datas = [xp.reshape(data, (200, 2)),
                 xp.reshape(data, (20, 20))[:10, :]]
        k = int(1e6)
        for data in datas:
            rng = np.random.default_rng(1234)
            init = _krandinit(data, k, rng, xp)
            orig_cov = cov(data.T)
            init_cov = cov(init.T)
            xp_assert_close(orig_cov, init_cov, atol=1.1e-2)

    def test_kmeans2_empty(self, xp):
        # Regression test for gh-1032.
        assert_raises(ValueError, kmeans2, xp.asarray([]), 2)

    def test_kmeans_0k(self, xp):
        # Regression test for gh-1073: fail when k arg is 0.
        assert_raises(ValueError, kmeans, xp.asarray(X), 0)
        assert_raises(ValueError, kmeans2, xp.asarray(X), 0)
        assert_raises(ValueError, kmeans2, xp.asarray(X), xp.asarray([]))

    def test_kmeans_large_thres(self, xp):
        # Regression test for gh-1774
        x = xp.asarray([1, 2, 3, 4, 10], dtype=xp.float64)
        res = kmeans(x, 1, thresh=1e16)
        xp_assert_close(res[0], xp.asarray([4.], dtype=xp.float64))
        xp_assert_close(res[1], xp.asarray(2.3999999999999999, dtype=xp.float64)[()])

    @skip_xp_backends('jax.numpy',
                      reasons=['jax arrays do not support item assignment'],
                      cpu_only=True)
    def test_kmeans2_kpp_low_dim(self, xp):
        # Regression test for gh-11462
        prev_res = xp.asarray([[-1.95266667, 0.898],
                               [-3.153375, 3.3945]], dtype=xp.float64)
        np.random.seed(42)
        res, _ = kmeans2(xp.asarray(TESTDATA_2D), 2, minit='++')
        xp_assert_close(res, prev_res)

    @skip_xp_backends('jax.numpy',
                      reasons=['jax arrays do not support item assignment'],
                      cpu_only=True)
    def test_kmeans2_kpp_high_dim(self, xp):
        # Regression test for gh-11462
        n_dim = 100
        size = 10
        centers = np.vstack([5 * np.ones(n_dim),
                             -5 * np.ones(n_dim)])
        np.random.seed(42)
        data = np.vstack([
            np.random.multivariate_normal(centers[0], np.eye(n_dim), size=size),
            np.random.multivariate_normal(centers[1], np.eye(n_dim), size=size)
        ])

        data = xp.asarray(data)
        res, _ = kmeans2(data, 2, minit='++')
        xp_assert_equal(xp.sign(res), xp.sign(xp.asarray(centers)))

    def test_kmeans_diff_convergence(self, xp):
        # Regression test for gh-8727
        obs = xp.asarray([-3, -1, 0, 1, 1, 8], dtype=xp.float64)
        res = kmeans(obs, xp.asarray([-3., 0.99]))
        xp_assert_close(res[0], xp.asarray([-0.4, 8.], dtype=xp.float64))
        xp_assert_close(res[1], xp.asarray(1.0666666666666667, dtype=xp.float64)[()])

    @skip_xp_backends('jax.numpy',
                      reasons=['jax arrays do not support item assignment'],
                      cpu_only=True)
    def test_kmeans_and_kmeans2_random_seed(self, xp):

        seed_list = [
            1234, np.random.RandomState(1234), np.random.default_rng(1234)
        ]

        for seed in seed_list:
            seed1 = deepcopy(seed)
            seed2 = deepcopy(seed)
            data = xp.asarray(TESTDATA_2D)
            # test for kmeans
            res1, _ = kmeans(data, 2, seed=seed1)
            res2, _ = kmeans(data, 2, seed=seed2)
            xp_assert_close(res1, res2)  # should be same results
            # test for kmeans2
            for minit in ["random", "points", "++"]:
                res1, _ = kmeans2(data, 2, minit=minit, seed=seed1)
                res2, _ = kmeans2(data, 2, minit=minit, seed=seed2)
                xp_assert_close(res1, res2)  # should be same results
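The final test above asserts that a fixed `seed` makes `kmeans`/`kmeans2` deterministic. A minimal standalone sketch of that contract (the data shape, k=3, and seed value are illustrative):

import numpy as np
from scipy.cluster.vq import kmeans2

data = np.random.default_rng(0).normal(size=(50, 2))
c1, _ = kmeans2(data, 3, minit='++', seed=1234)
c2, _ = kmeans2(data, 3, minit='++', seed=1234)
assert np.allclose(c1, c2)   # identical seeds give identical centroids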
venv/lib/python3.12/site-packages/scipy/cluster/vq.py (new file, 835 lines)
@@ -0,0 +1,835 @@
"""
K-means clustering and vector quantization (:mod:`scipy.cluster.vq`)
====================================================================

Provides routines for k-means clustering, generating code books
from k-means models and quantizing vectors by comparing them with
centroids in a code book.

.. autosummary::
   :toctree: generated/

   whiten -- Normalize a group of observations so each feature has unit variance
   vq -- Calculate code book membership of a set of observation vectors
   kmeans -- Perform k-means on a set of observation vectors forming k clusters
   kmeans2 -- A different implementation of k-means with more methods
              for initializing centroids

Background information
----------------------
The k-means algorithm takes as input the number of clusters to
generate, k, and a set of observation vectors to cluster. It
returns a set of centroids, one for each of the k clusters. An
observation vector is classified with the cluster number or
centroid index of the centroid closest to it.

A vector v belongs to cluster i if it is closer to centroid i than
any other centroid. If v belongs to i, we say centroid i is the
dominating centroid of v. The k-means algorithm tries to
minimize distortion, which is defined as the sum of the squared distances
between each observation vector and its dominating centroid.
The minimization is achieved by iteratively reclassifying
the observations into clusters and recalculating the centroids until
a configuration is reached in which the centroids are stable. One can
also define a maximum number of iterations.

Since vector quantization is a natural application for k-means,
information theory terminology is often used. The centroid index
or cluster index is also referred to as a "code" and the table
mapping codes to centroids, and vice versa, is often referred to as a
"code book". The result of k-means, a set of centroids, can be
used to quantize vectors. Quantization aims to find an encoding of
vectors that reduces the expected distortion.

All routines expect obs to be an M by N array, where the rows are
the observation vectors. The codebook is a k by N array, where the
ith row is the centroid of code word i. The observation vectors
and centroids have the same feature dimension.

As an example, suppose we wish to compress a 24-bit color image
(each pixel is represented by one byte for red, one for blue, and
one for green) before sending it over the web. By using a smaller
8-bit encoding, we can reduce the amount of data by two
thirds. Ideally, the colors for each of the 256 possible 8-bit
encoding values should be chosen to minimize distortion of the
color. Running k-means with k=256 generates a code book of 256
codes, which fills up all possible 8-bit sequences. Instead of
sending a 3-byte value for each pixel, the 8-bit centroid index
(or code word) of the dominating centroid is transmitted. The code
book is also sent over the wire so each 8-bit code can be
translated back to a 24-bit pixel value representation. If the
image of interest was of an ocean, we would expect many 24-bit
blues to be represented by 8-bit codes. If it was an image of a
human face, more flesh-tone colors would be represented in the
code book.

"""
import warnings
import numpy as np
from collections import deque
from scipy._lib._array_api import (
    _asarray, array_namespace, size, atleast_nd, copy, cov
)
from scipy._lib._util import check_random_state, rng_integers
from scipy.spatial.distance import cdist

from . import _vq

__docformat__ = 'restructuredtext'

__all__ = ['whiten', 'vq', 'kmeans', 'kmeans2']


class ClusterError(Exception):
    pass


def whiten(obs, check_finite=True):
    """
    Normalize a group of observations on a per feature basis.

    Before running k-means, it is beneficial to rescale each feature
    dimension of the observation set by its standard deviation (i.e. "whiten"
    it - as in "white noise" where each frequency has equal power).
    Each feature is divided by its standard deviation across all observations
    to give it unit variance.

    Parameters
    ----------
    obs : ndarray
        Each row of the array is an observation. The
        columns are the features seen during each observation.

        >>> #        f0    f1    f2
        >>> obs = [[ 1.,   1.,   1.],  #o0
        ...        [ 2.,   2.,   2.],  #o1
        ...        [ 3.,   3.,   3.],  #o2
        ...        [ 4.,   4.,   4.]]  #o3

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    result : ndarray
        Contains the values in `obs` scaled by the standard deviation
        of each column.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.vq import whiten
    >>> features = np.array([[1.9, 2.3, 1.7],
    ...                      [1.5, 2.5, 2.2],
    ...                      [0.8, 0.6, 1.7,]])
    >>> whiten(features)
    array([[ 4.17944278,  2.69811351,  7.21248917],
           [ 3.29956009,  2.93273208,  9.33380951],
           [ 1.75976538,  0.7038557 ,  7.21248917]])

    """
    xp = array_namespace(obs)
    obs = _asarray(obs, check_finite=check_finite, xp=xp)
    std_dev = xp.std(obs, axis=0)
    zero_std_mask = std_dev == 0
    if xp.any(zero_std_mask):
        std_dev[zero_std_mask] = 1.0
        warnings.warn("Some columns have standard deviation zero. "
                      "The values of these columns will not change.",
                      RuntimeWarning, stacklevel=2)
    return obs / std_dev


def vq(obs, code_book, check_finite=True):
    """
    Assign codes from a code book to observations.

    Assigns a code from a code book to each observation. Each
    observation vector in the 'M' by 'N' `obs` array is compared with the
    centroids in the code book and assigned the code of the closest
    centroid.

    The features in `obs` should have unit variance, which can be
    achieved by passing them through the whiten function. The code
    book can be created with the k-means algorithm or a different
    encoding algorithm.

    Parameters
    ----------
    obs : ndarray
        Each row of the 'M' x 'N' array is an observation. The columns are
        the "features" seen during each observation. The features must be
        whitened first using the whiten function or something equivalent.
    code_book : ndarray
        The code book is usually generated using the k-means algorithm.
        Each row of the array holds a different code, and the columns are
        the features of the code.

        >>> #              f0    f1    f2   f3
        >>> code_book = [
        ...             [  1.,   2.,   3.,   4.],  #c0
        ...             [  1.,   2.,   3.,   4.],  #c1
        ...             [  1.,   2.,   3.,   4.]]  #c2

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        A length M array holding the code book index for each observation.
    dist : ndarray
        The distortion (distance) between the observation and its nearest
        code.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.vq import vq
    >>> code_book = np.array([[1., 1., 1.],
    ...                       [2., 2., 2.]])
    >>> features = np.array([[1.9, 2.3, 1.7],
    ...                      [1.5, 2.5, 2.2],
    ...                      [0.8, 0.6, 1.7]])
    >>> vq(features, code_book)
    (array([1, 1, 0], dtype=int32), array([0.43588989, 0.73484692, 0.83066239]))

    """
    xp = array_namespace(obs, code_book)
    obs = _asarray(obs, xp=xp, check_finite=check_finite)
    code_book = _asarray(code_book, xp=xp, check_finite=check_finite)
    ct = xp.result_type(obs, code_book)

    c_obs = xp.astype(obs, ct, copy=False)
    c_code_book = xp.astype(code_book, ct, copy=False)

    if xp.isdtype(ct, kind='real floating'):
        c_obs = np.asarray(c_obs)
        c_code_book = np.asarray(c_code_book)
        result = _vq.vq(c_obs, c_code_book)
        return xp.asarray(result[0]), xp.asarray(result[1])
    return py_vq(obs, code_book, check_finite=False)


def py_vq(obs, code_book, check_finite=True):
    """ Python version of vq algorithm.

    The algorithm computes the Euclidean distance between each
    observation and every frame in the code_book.

    Parameters
    ----------
    obs : ndarray
        Expects a rank 2 array. Each row is one observation.
    code_book : ndarray
        Code book to use. Same format as obs. Should have the same number
        of features (i.e., columns) as obs.
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        code[i] gives the label of the ith observation; its code is
        code_book[code[i]].
    min_dist : ndarray
        min_dist[i] gives the distance between the ith observation and its
        corresponding code.

    Notes
    -----
    This function is slower than the C version but works for
    all input types. If the inputs have the wrong types for the
    C versions of the function, this one is called as a last resort.

    It is about 20 times slower than the C version.

    """
    xp = array_namespace(obs, code_book)
    obs = _asarray(obs, xp=xp, check_finite=check_finite)
    code_book = _asarray(code_book, xp=xp, check_finite=check_finite)

    if obs.ndim != code_book.ndim:
        raise ValueError("Observation and code_book should have the same rank")

    if obs.ndim == 1:
        obs = obs[:, xp.newaxis]
        code_book = code_book[:, xp.newaxis]

    # Once `cdist` has array API support, this `xp.asarray` call can be removed
    dist = xp.asarray(cdist(obs, code_book))
    code = xp.argmin(dist, axis=1)
    min_dist = xp.min(dist, axis=1)
    return code, min_dist


def _kmeans(obs, guess, thresh=1e-5, xp=None):
    """ "raw" version of k-means.

    Returns
    -------
    code_book
        The lowest distortion codebook found.
    avg_dist
        The average distance an observation is from a code in the book.
        Lower means the code_book matches the data better.

    See Also
    --------
    kmeans : wrapper around k-means

    Examples
    --------
    Note: not whitened in this example.

    >>> import numpy as np
    >>> from scipy.cluster.vq import _kmeans
    >>> features = np.array([[ 1.9, 2.3],
    ...                      [ 1.5, 2.5],
    ...                      [ 0.8, 0.6],
    ...                      [ 0.4, 1.8],
    ...                      [ 1.0, 1.0]])
    >>> book = np.array((features[0], features[2]))
    >>> _kmeans(features, book)
    (array([[ 1.7       ,  2.4       ],
           [ 0.73333333,  1.13333333]]), 0.40563916697728591)

    """
    xp = np if xp is None else xp
    code_book = guess
    diff = xp.inf
    prev_avg_dists = deque([diff], maxlen=2)
    while diff > thresh:
        # compute membership and distances between obs and code_book
        obs_code, distort = vq(obs, code_book, check_finite=False)
        prev_avg_dists.append(xp.mean(distort, axis=-1))
        # recalc code_book as centroids of associated obs
        obs = np.asarray(obs)
        obs_code = np.asarray(obs_code)
        code_book, has_members = _vq.update_cluster_means(obs, obs_code,
                                                          code_book.shape[0])
        obs = xp.asarray(obs)
        obs_code = xp.asarray(obs_code)
        code_book = xp.asarray(code_book)
        has_members = xp.asarray(has_members)
        code_book = code_book[has_members]
        diff = xp.abs(prev_avg_dists[0] - prev_avg_dists[1])

    return code_book, prev_avg_dists[1]


def kmeans(obs, k_or_guess, iter=20, thresh=1e-5, check_finite=True,
           *, seed=None):
    """
    Performs k-means on a set of observation vectors forming k clusters.

    The k-means algorithm adjusts the classification of the observations
    into clusters and updates the cluster centroids until the position of
    the centroids is stable over successive iterations. In this
    implementation of the algorithm, the stability of the centroids is
    determined by comparing the absolute value of the change in the average
    Euclidean distance between the observations and their corresponding
    centroids against a threshold. This yields
    a code book mapping centroids to codes and vice versa.

    Parameters
    ----------
    obs : ndarray
        Each row of the M by N array is an observation vector. The
        columns are the features seen during each observation.
        The features must be whitened first with the `whiten` function.

    k_or_guess : int or ndarray
        The number of centroids to generate. A code is assigned to
        each centroid, which is also the row index of the centroid
        in the code_book matrix generated.

        The initial k centroids are chosen by randomly selecting
        observations from the observation matrix. Alternatively,
        passing a k by N array specifies the initial k centroids.

    iter : int, optional
        The number of times to run k-means, returning the codebook
        with the lowest distortion. This argument is ignored if
        initial centroids are specified with an array for the
        ``k_or_guess`` parameter. This parameter does not represent the
        number of iterations of the k-means algorithm.

    thresh : float, optional
        Terminates the k-means algorithm if the change in
        distortion since the last k-means iteration is less than
        or equal to threshold.

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
        Seed for initializing the pseudo-random number generator.
        If `seed` is None (or `numpy.random`), the `numpy.random.RandomState`
        singleton is used.
        If `seed` is an int, a new ``RandomState`` instance is used,
        seeded with `seed`.
        If `seed` is already a ``Generator`` or ``RandomState`` instance then
        that instance is used.
        The default is None.

    Returns
    -------
    codebook : ndarray
        A k by N array of k centroids. The ith centroid
        codebook[i] is represented with the code i. The centroids
        and codes generated represent the lowest distortion seen,
        not necessarily the globally minimal distortion.
        Note that the number of centroids is not necessarily the same as the
        ``k_or_guess`` parameter, because centroids assigned to no observations
        are removed during iterations.

    distortion : float
        The mean (non-squared) Euclidean distance between the observations
        passed and the centroids generated. Note the difference to the standard
        definition of distortion in the context of the k-means algorithm, which
        is the sum of the squared distances.

    See Also
    --------
    kmeans2 : a different implementation of k-means clustering
       with more methods for generating initial centroids but without
       using a distortion change threshold as a stopping criterion.

    whiten : must be called prior to passing an observation matrix
       to kmeans.

    Notes
    -----
    For more functionalities or optimal performance, you can use
    `sklearn.cluster.KMeans <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html>`_.
    `This <https://hdbscan.readthedocs.io/en/latest/performance_and_scalability.html#comparison-of-high-performance-implementations>`_
    is a benchmark result of several implementations.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.vq import vq, kmeans, whiten
    >>> import matplotlib.pyplot as plt
    >>> features = np.array([[ 1.9, 2.3],
    ...                      [ 1.5, 2.5],
    ...                      [ 0.8, 0.6],
    ...                      [ 0.4, 1.8],
    ...                      [ 0.1, 0.1],
    ...                      [ 0.2, 1.8],
    ...                      [ 2.0, 0.5],
    ...                      [ 0.3, 1.5],
    ...                      [ 1.0, 1.0]])
    >>> whitened = whiten(features)
    >>> book = np.array((whitened[0], whitened[2]))
    >>> kmeans(whitened, book)
    (array([[ 2.3110306 ,  2.86287398],    # random
           [ 0.93218041,  1.24398691]]), 0.85684700941625547)

    >>> codes = 3
    >>> kmeans(whitened, codes)
    (array([[ 2.3110306 ,  2.86287398],    # random
           [ 1.32544402,  0.65607529],
           [ 0.40782893,  2.02786907]]), 0.5196582527686241)

    >>> # Create 50 datapoints in two clusters a and b
    >>> pts = 50
    >>> rng = np.random.default_rng()
    >>> a = rng.multivariate_normal([0, 0], [[4, 1], [1, 4]], size=pts)
    >>> b = rng.multivariate_normal([30, 10],
    ...                             [[10, 2], [2, 1]],
    ...                             size=pts)
    >>> features = np.concatenate((a, b))
    >>> # Whiten data
    >>> whitened = whiten(features)
    >>> # Find 2 clusters in the data
    >>> codebook, distortion = kmeans(whitened, 2)
    >>> # Plot whitened data and cluster centers in red
    >>> plt.scatter(whitened[:, 0], whitened[:, 1])
    >>> plt.scatter(codebook[:, 0], codebook[:, 1], c='r')
    >>> plt.show()

    """
    if isinstance(k_or_guess, int):
        xp = array_namespace(obs)
    else:
        xp = array_namespace(obs, k_or_guess)
    obs = _asarray(obs, xp=xp, check_finite=check_finite)
    guess = _asarray(k_or_guess, xp=xp, check_finite=check_finite)
    if iter < 1:
        raise ValueError("iter must be at least 1, got %s" % iter)

    # Determine whether a count (scalar) or an initial guess (array) was passed.
    if size(guess) != 1:
        if size(guess) < 1:
            raise ValueError("Asked for 0 clusters. Initial book was %s" %
                             guess)
        return _kmeans(obs, guess, thresh=thresh, xp=xp)

    # k_or_guess is a scalar, now verify that it's an integer
    k = int(guess)
    if k != guess:
        raise ValueError("If k_or_guess is a scalar, it must be an integer.")
    if k < 1:
        raise ValueError("Asked for %d clusters." % k)

    rng = check_random_state(seed)

    # initialize best distance value to a large value
    best_dist = xp.inf
    for i in range(iter):
        # the initial code book is randomly selected from observations
        guess = _kpoints(obs, k, rng, xp)
        book, dist = _kmeans(obs, guess, thresh=thresh, xp=xp)
        if dist < best_dist:
            best_book = book
            best_dist = dist
    return best_book, best_dist


def _kpoints(data, k, rng, xp):
    """Pick k points at random in data (one row = one observation).

    Parameters
    ----------
    data : ndarray
        Expect a rank 1 or 2 array. Rank 1 is assumed to describe one
        dimensional data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    rng : `numpy.random.Generator` or `numpy.random.RandomState`
        Random number generator.

    Returns
    -------
    x : ndarray
        A 'k' by 'N' array containing the initial centroids

    """
    idx = rng.choice(data.shape[0], size=int(k), replace=False)
    # convert to array with default integer dtype (avoids numpy#25607)
    idx = xp.asarray(idx, dtype=xp.asarray([1]).dtype)
    return xp.take(data, idx, axis=0)


def _krandinit(data, k, rng, xp):
    """Returns k samples of a random variable whose parameters depend on data.

    More precisely, it returns k observations sampled from a Gaussian random
    variable whose mean and covariances are the ones estimated from the data.

    Parameters
    ----------
    data : ndarray
        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
        data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    rng : `numpy.random.Generator` or `numpy.random.RandomState`
        Random number generator.

    Returns
    -------
    x : ndarray
        A 'k' by 'N' array containing the initial centroids

    """
    mu = xp.mean(data, axis=0)
    k = np.asarray(k)

    if data.ndim == 1:
        _cov = cov(data)
        x = rng.standard_normal(size=k)
        x = xp.asarray(x)
        x *= xp.sqrt(_cov)
    elif data.shape[1] > data.shape[0]:
        # initialize when the covariance matrix is rank deficient
        _, s, vh = xp.linalg.svd(data - mu, full_matrices=False)
        x = rng.standard_normal(size=(k, size(s)))
        x = xp.asarray(x)
        sVh = s[:, None] * vh / xp.sqrt(data.shape[0] - xp.asarray(1.))
        x = x @ sVh
    else:
        _cov = atleast_nd(cov(data.T), ndim=2)

        # k rows, d cols (one row = one obs)
        # Generate k samples of a random variable ~ Gaussian(mu, cov)
        x = rng.standard_normal(size=(k, size(mu)))
        x = xp.asarray(x)
        x = x @ xp.linalg.cholesky(_cov).T

    x += mu
    return x


def _kpp(data, k, rng, xp):
    """ Picks k points in the data based on the kmeans++ method.

    Parameters
    ----------
    data : ndarray
        Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
        data, rank 2 multidimensional data, in which case one
        row is one observation.
    k : int
        Number of samples to generate.
    rng : `numpy.random.Generator` or `numpy.random.RandomState`
        Random number generator.

    Returns
    -------
    init : ndarray
        A 'k' by 'N' array containing the initial centroids.

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
       on Discrete Algorithms, 2007.
    """

    ndim = len(data.shape)
    if ndim == 1:
        data = data[:, None]

    dims = data.shape[1]

    init = xp.empty((int(k), dims))

    for i in range(k):
        if i == 0:
            init[i, :] = data[rng_integers(rng, data.shape[0]), :]

        else:
            D2 = cdist(init[:i, :], data, metric='sqeuclidean').min(axis=0)
            probs = D2 / D2.sum()
            cumprobs = probs.cumsum()
            r = rng.uniform()
            cumprobs = np.asarray(cumprobs)
            init[i, :] = data[np.searchsorted(cumprobs, r), :]

    if ndim == 1:
        init = init[:, 0]
    return init


_valid_init_meth = {'random': _krandinit, 'points': _kpoints, '++': _kpp}


def _missing_warn():
    """Print a warning when called."""
    warnings.warn("One of the clusters is empty. "
                  "Re-run kmeans with a different initialization.",
                  stacklevel=3)


def _missing_raise():
    """Raise a ClusterError when called."""
    raise ClusterError("One of the clusters is empty. "
                       "Re-run kmeans with a different initialization.")


_valid_miss_meth = {'warn': _missing_warn, 'raise': _missing_raise}


def kmeans2(data, k, iter=10, thresh=1e-5, minit='random',
            missing='warn', check_finite=True, *, seed=None):
    """
    Classify a set of observations into k clusters using the k-means algorithm.

    The algorithm attempts to minimize the Euclidean distance between
    observations and centroids. Several initialization methods are
    included.

    Parameters
    ----------
    data : ndarray
        A 'M' by 'N' array of 'M' observations in 'N' dimensions or a length
        'M' array of 'M' 1-D observations.
    k : int or ndarray
        The number of clusters to form as well as the number of
        centroids to generate. If the `minit` initialization string is
        'matrix', or if an ndarray is given instead, it is
        interpreted as the initial clusters to use instead.
    iter : int, optional
        Number of iterations of the k-means algorithm to run. Note
        that this differs in meaning from the iters parameter to
        the kmeans function.
    thresh : float, optional
        (not used yet)
    minit : str, optional
        Method for initialization. Available methods are 'random',
        'points', '++' and 'matrix':

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        'points': choose k observations (rows) at random from data for
        the initial centroids.

        '++': choose k observations according to the kmeans++ method
        (careful seeding)

        'matrix': interpret the k parameter as a k by M (or length k
        array for 1-D data) array of initial centroids.
    missing : str, optional
        Method to deal with empty clusters. Available methods are
        'warn' and 'raise':

        'warn': give a warning and continue.

        'raise': raise a ClusterError and terminate the algorithm.
    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True
    seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
        Seed for initializing the pseudo-random number generator.
        If `seed` is None (or `numpy.random`), the `numpy.random.RandomState`
        singleton is used.
        If `seed` is an int, a new ``RandomState`` instance is used,
        seeded with `seed`.
        If `seed` is already a ``Generator`` or ``RandomState`` instance then
        that instance is used.
        The default is None.

    Returns
    -------
    centroid : ndarray
        A 'k' by 'N' array of centroids found at the last iteration of
        k-means.
    label : ndarray
        label[i] is the code or index of the centroid the
        ith observation is closest to.

    See Also
    --------
    kmeans

    References
    ----------
    .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
       careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
       on Discrete Algorithms, 2007.

    Examples
    --------
    >>> from scipy.cluster.vq import kmeans2
    >>> import matplotlib.pyplot as plt
    >>> import numpy as np

    Create z, an array with shape (100, 2) containing a mixture of samples
    from three multivariate normal distributions.

    >>> rng = np.random.default_rng()
    >>> a = rng.multivariate_normal([0, 6], [[2, 1], [1, 1.5]], size=45)
    >>> b = rng.multivariate_normal([2, 0], [[1, -1], [-1, 3]], size=30)
    >>> c = rng.multivariate_normal([6, 4], [[5, 0], [0, 1.2]], size=25)
    >>> z = np.concatenate((a, b, c))
    >>> rng.shuffle(z)

    Compute three clusters.

    >>> centroid, label = kmeans2(z, 3, minit='points')
    >>> centroid
    array([[ 2.22274463, -0.61666946],  # may vary
           [ 0.54069047,  5.86541444],
           [ 6.73846769,  4.01991898]])

    How many points are in each cluster?

    >>> counts = np.bincount(label)
    >>> counts
    array([29, 51, 20])  # may vary

    Plot the clusters.

    >>> w0 = z[label == 0]
    >>> w1 = z[label == 1]
    >>> w2 = z[label == 2]
    >>> plt.plot(w0[:, 0], w0[:, 1], 'o', alpha=0.5, label='cluster 0')
    >>> plt.plot(w1[:, 0], w1[:, 1], 'd', alpha=0.5, label='cluster 1')
    >>> plt.plot(w2[:, 0], w2[:, 1], 's', alpha=0.5, label='cluster 2')
    >>> plt.plot(centroid[:, 0], centroid[:, 1], 'k*', label='centroids')
    >>> plt.axis('equal')
    >>> plt.legend(shadow=True)
    >>> plt.show()

    """
    if int(iter) < 1:
        raise ValueError("Invalid iter (%s), "
                         "must be a positive integer." % iter)
    try:
        miss_meth = _valid_miss_meth[missing]
    except KeyError as e:
        raise ValueError(f"Unknown missing method {missing!r}") from e

    if isinstance(k, int):
        xp = array_namespace(data)
    else:
        xp = array_namespace(data, k)
    data = _asarray(data, xp=xp, check_finite=check_finite)
    code_book = copy(k, xp=xp)
    if data.ndim == 1:
        d = 1
    elif data.ndim == 2:
        d = data.shape[1]
    else:
        raise ValueError("Input of rank > 2 is not supported.")

    if size(data) < 1 or size(code_book) < 1:
        raise ValueError("Empty input is not supported.")

    # If k is not a single value, it should be compatible with data's shape
    if minit == 'matrix' or size(code_book) > 1:
        if data.ndim != code_book.ndim:
            raise ValueError("k array doesn't match data rank")
        nc = code_book.shape[0]
        if data.ndim > 1 and code_book.shape[1] != d:
            raise ValueError("k array doesn't match data dimension")
    else:
        nc = int(code_book)

        if nc < 1:
            raise ValueError("Cannot ask kmeans2 for %d clusters"
                             " (k was %s)" % (nc, code_book))
        elif nc != code_book:
            warnings.warn("k was not an integer, was converted.", stacklevel=2)

        try:
            init_meth = _valid_init_meth[minit]
        except KeyError as e:
            raise ValueError(f"Unknown init method {minit!r}") from e
        else:
            rng = check_random_state(seed)
            code_book = init_meth(data, code_book, rng, xp)

    data = np.asarray(data)
    code_book = np.asarray(code_book)
    for i in range(iter):
        # Compute the nearest neighbor for each obs using the current code book
        label = vq(data, code_book, check_finite=check_finite)[0]
        # Update the code book by computing centroids
        new_code_book, has_members = _vq.update_cluster_means(data, label, nc)
        if not has_members.all():
            miss_meth()
            # Set the empty clusters to their previous positions
            new_code_book[~has_members] = code_book[~has_members]
        code_book = new_code_book

    return xp.asarray(code_book), xp.asarray(label)
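The module docstring's image-compression example can be sketched end to end with the public API above; the random "pixel" data, k=16, and seed below are illustrative assumptions, not part of the commit:

import numpy as np
from scipy.cluster.vq import kmeans, vq

rng = np.random.default_rng(0)
pixels = rng.random((1000, 3))                 # stand-in for normalized RGB rows

codebook, distortion = kmeans(pixels, 16, seed=1234)   # 16 codes instead of 256, for speed
codes, dist = vq(pixels, codebook)             # one code index per pixel
reconstructed = codebook[codes]                # decoding is a table lookup
assert reconstructed.shape == pixels.shape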