Course: Math 535 - Mathematical Methods in Data Science (MMiDS)
Chapter: 1-Introduction: a first data science problem
Author: Sebastien Roch, Department of Mathematics, University of Wisconsin-Madison
Updated: July 15, 2024
Copyright: © 2024 Sebastien Roch


$\newcommand{\bmu}{\boldsymbol{\mu}}$ $\newcommand{\bSigma}{\boldsymbol{\Sigma}}$ $\newcommand{\bfbeta}{\boldsymbol{\beta}}$ $\newcommand{\bflambda}{\boldsymbol{\lambda}}$ $\newcommand{\bgamma}{\boldsymbol{\gamma}}$ $\newcommand{\bsigma}{{\boldsymbol{\sigma}}}$ $\newcommand{\bpi}{\boldsymbol{\pi}}$ $\newcommand{\btheta}{{\boldsymbol{\theta}}}$ $\newcommand{\bphi}{\boldsymbol{\phi}}$ $\newcommand{\balpha}{\boldsymbol{\alpha}}$ $\newcommand{\blambda}{\boldsymbol{\lambda}}$ $\renewcommand{\P}{\mathbb{P}}$ $\newcommand{\E}{\mathbb{E}}$ $\newcommand{\indep}{\perp\!\!\!\perp} \newcommand{\bx}{\mathbf{x}}$ $\newcommand{\bp}{\mathbf{p}}$ $\renewcommand{\bx}{\mathbf{x}}$ $\newcommand{\bX}{\mathbf{X}}$ $\newcommand{\by}{\mathbf{y}}$ $\newcommand{\bY}{\mathbf{Y}}$ $\newcommand{\bz}{\mathbf{z}}$ $\newcommand{\bZ}{\mathbf{Z}}$ $\newcommand{\bw}{\mathbf{w}}$ $\newcommand{\bW}{\mathbf{W}}$ $\newcommand{\bv}{\mathbf{v}}$ $\newcommand{\bV}{\mathbf{V}}$ $\newcommand{\bfg}{\mathbf{g}}$ $\newcommand{\bfh}{\mathbf{h}}$ $\newcommand{\horz}{\rule[.5ex]{2.5ex}{0.5pt}}$ $\renewcommand{\S}{\mathcal{S}}$ $\newcommand{\X}{\mathcal{X}}$ $\newcommand{\var}{\mathrm{Var}}$ $\newcommand{\pa}{\mathrm{pa}}$ $\newcommand{\Z}{\mathcal{Z}}$ $\newcommand{\bh}{\mathbf{h}}$ $\newcommand{\bb}{\mathbf{b}}$ $\newcommand{\bc}{\mathbf{c}}$ $\newcommand{\cE}{\mathcal{E}}$ $\newcommand{\cP}{\mathcal{P}}$ $\newcommand{\bbeta}{\boldsymbol{\beta}}$ $\newcommand{\bLambda}{\boldsymbol{\Lambda}}$ $\newcommand{\cov}{\mathrm{Cov}}$ $\newcommand{\bfk}{\mathbf{k}}$ $\newcommand{\idx}[1]{}$ $\newcommand{\xdi}{}$

Motivating example: identifying penguin species¶

Imagine that you are an evolutionary biologist studying penguins. You have collected measurements on a large number of individual specimens. Your goal is to identify different species within this collection based on those measurements.

Here is a penguin dataset collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER. We load the data into a data table (similar to a spreadsheet) called a DataFrame in pandas, where the columns are different measurements (or features) and the rows are different samples. Below, we read the file using pandas.read_csv and show the first $5$ lines of the dataset (see DataFrame.head). This dataset is a simplified version (i.e., with some columns removed) of the full dataset, which is maintained by Allison Horst at this GitHub page.

In [3]:
import pandas as pd
data = pd.read_csv('penguins-measurements.csv')
data.head()
Out[3]:
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
0 39.1 18.7 181.0 3750.0
1 39.5 17.4 186.0 3800.0
2 40.3 18.0 195.0 3250.0
3 NaN NaN NaN NaN
4 36.7 19.3 193.0 3450.0

Observe that this dataset has missing values (i.e., the entries NaN above). A common way to deal with this issue is to remove all rows with missing values. This can be done using pandas.DataFrame.dropna. This kind of pre-processing is fundamental in data science, but we will not discuss it much in this book. It is however important to be aware of it.

In [4]:
data = data.dropna()
data.head()
Out[4]:
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
0 39.1 18.7 181.0 3750.0
1 39.5 17.4 186.0 3800.0
2 40.3 18.0 195.0 3250.0
4 36.7 19.3 193.0 3450.0
5 39.3 20.6 190.0 3650.0

There are $342$ samples, as can be seen by using pandas.DataFrame.shape which gives the dimensions of the DataFrame as a tuple.

In [5]:
data.shape
Out[5]:
(342, 4)

Let us first extract the columns into a Numpy array using pandas.DataFrame.to_numpy(). We will have more to say later about Numpy, a numerical library for Python that, in essence, allows us to manipulate vectors and matrices.

In [6]:
X = data.to_numpy()
print(X)
[[  39.1   18.7  181.  3750. ]
 [  39.5   17.4  186.  3800. ]
 [  40.3   18.   195.  3250. ]
 ...
 [  50.4   15.7  222.  5750. ]
 [  45.2   14.8  212.  5200. ]
 [  49.9   16.1  213.  5400. ]]

We visualize two measurements in the data, the bill depth and flipper length. (The original dataset used the more precise term culmen depth.) Below, each point is a sample. This is called a scatter plot$\idx{scatter plot}\xdi$. Quoting Wikipedia:

The data are displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis.

We use matplotlib.pyplot for most of our plotting needs in this book, with a few exceptions. Specifically, here we use the function matplotlib.pyplot.scatter.

In [7]:
import matplotlib.pyplot as plt
plt.scatter(X[:,1], X[:,2], s=5, c='k')
plt.xlabel('bill_depth_mm'), plt.ylabel('flipper_length_mm')
plt.show()

We observe what appear to be two fairly well-defined clusters of samples, on the top left and bottom right respectively. What is a cluster? Intuitively, it is a group of samples that are close to each other, but far from every other sample. In this case, it may be an indication that the two groups correspond to different species.

Now let's look at the full dataset. Visualizing the full $4$-dimensional data is not straightforward. One way to do this is to consider all pairwise scatter plots. We use the function seaborn.pairplot from the library Seaborn.

In [8]:
import seaborn as sns
sns.pairplot(data, height=2)
plt.show()

How many species of penguins do you think there are in this dataset?

What would be useful is a method that automatically identifies clusters whatever the dimension of the data. In this chapter, we will discuss a standard way to do this: $k$-means clustering. We will come back to the penguins dataset later in the chapter.

But first we need to review some basic concepts about vectors and distances in order to formulate clustering as an appropriate optimization problem, a perspective that will recur throughout.

Background: quick refresher of matrix algebra, differential calculus, and elementary probability¶

NUMERICAL CORNER: In Numpy, a vector is defined as a 1d array. We must first import the Numpy package, which is commonly abbreviated as np.

In [9]:
import numpy as np
u = np.array([1., 3., 5. ,7.])
print(u)
[1. 3. 5. 7.]

We access the entries of u as follows; note that indexing in Numpy starts at $0$.

In [10]:
print(u[0])
print(u[1])
1.0
3.0

To obtain the norm of a vector, we can use the function linalg.norm, which requires the numpy.linalg package (often abbreviated as LA):

In [11]:
from numpy import linalg as LA
LA.norm(u)
Out[11]:
9.16515138991168

which we check next "by hand"

In [12]:
np.sqrt(np.sum(u ** 2))
Out[12]:
9.16515138991168

In Numpy, ** indicates element-wise exponentiation.
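
For instance (a quick illustration, not in the original notes), applying ** to the vector u squares each entry separately:

print(u ** 2)   # [ 1.  9. 25. 49.]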

NUMERICAL CORNER: We will often work with collections of $n$ vectors $\mathbf{x}_1, \ldots, \mathbf{x}_n$ in $\mathbb{R}^d$ and it will be convenient to stack them up into a matrix

$$ X = \begin{bmatrix} \mathbf{x}_1^T \\ \mathbf{x}_2^T \\ \vdots \\ \mathbf{x}_n^T \\ \end{bmatrix} = \begin{bmatrix} x_{11} & x_{12} & \cdots & x_{1d} \\ x_{21} & x_{22} & \cdots & x_{2d} \\ \vdots & \vdots & \ddots & \vdots \\ x_{n1} & x_{n2} & \cdots & x_{nd} \\ \end{bmatrix}. $$

To create a matrix out of two vectors, we use the function numpy.stack.

In [13]:
u = np.array([1., 3., 5., 7.])
v = np.array([2., 4., 6., 8.])
X = np.stack((u,v),axis=0)
print(X)
[[1. 3. 5. 7.]
 [2. 4. 6. 8.]]

Quoting the documentation:

The axis parameter specifies the index of the new axis in the dimensions of the result. For example, if axis=0 it will be the first dimension and if axis=-1 it will be the last dimension.
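
For instance (a quick illustration, not in the original notes), stacking the same two vectors with axis=-1 places them as the columns of the result rather than its rows:

print(np.stack((u,v), axis=-1))   # a 4 by 2 matrix whose columns are u and v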

Alternatively, we can define the same matrix as follows.

In [14]:
Y = np.array([[1., 3., 5., 7.],[2., 4., 6., 8.]])
print(Y)
[[1. 3. 5. 7.]
 [2. 4. 6. 8.]]

We access the entries as follows.

In [15]:
print(Y[0,0])
print(Y[0,1])
1.0
3.0

NUMERICAL CORNER: In Numpy, the Frobenius norm of a matrix can be computed using the function numpy.linalg.norm.

In [16]:
A = np.array([[1., 0.],[0., 1.],[0., 0.]])
print(A)
[[1. 0.]
 [0. 1.]
 [0. 0.]]
In [17]:
LA.norm(A)
Out[17]:
1.4142135623730951
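
As with the vector norm above, we can check this "by hand" (a quick verification, not in the original notes):

np.sqrt(np.sum(A ** 2))   # equals sqrt(2), matching the value above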

NUMERICAL CORNER: The function $f(x) = x^2$ over $\mathbb{R}$ has a global minimizer at $x^* = 0$. Indeed, we clearly have $f(x) \geq 0$ for all $x$ while $f(0) = 0$. To plot the function, we use the Matplotlib package again, and specifically its function matplotlib.pyplot.plot. We also use the function numpy.linspace to create an array of evenly spaced numbers where we evaluate $f$.

In [18]:
import matplotlib.pyplot as plt

x = np.linspace(-2,2,100)
y = x ** 2

plt.plot(x, y, c='k')
plt.ylim(-0.25,4.25)
plt.show()

The function $f(x) = e^x$ over $\mathbb{R}$ does not have a global minimizer. Indeed, $f(x) > 0$ for all $x$, but no $x$ achieves the value $0$; and, for any $m > 0$, there is $x$ negative enough that $f(x) < m$, so the infimum $0$ is not attained. Note that $\mathbb{R}$ is not bounded, so the Extreme Value Theorem does not apply here.

In [19]:
x = np.linspace(-2,2,100)
y = np.exp(x)

plt.plot(x, y, c='k')
plt.ylim(-0.25,4.25)
plt.show()

The function $f(x) = (x+1)^2 (x-1)^2$ over $\mathbb{R}$ has two global minimizers at $x^* = -1$ and $x^{**} = 1$. Indeed, $f(x) \geq 0$ and $f(x) = 0$ if and only if $x = x^*$ or $x = x^{**}$.

In [20]:
x = np.linspace(-2,2,100)
y = ((x+1)**2) * ((x-1)**2)

plt.plot(x,y,c='k')
plt.ylim(-0.25,4.25)
plt.show()

NUMERICAL CORNER: We can use simulations to confirm the Weak Law of Large Numbers. Recall that a uniform random variable over the interval $[a,b]$ has density

$$ f_{X}(x) = \begin{cases} \frac{1}{b-a} & x \in [a,b] \\ 0 & \text{o.w.} \end{cases} $$

We write $X \sim \mathrm{U}[a,b]$. We can obtain a sample from $\mathrm{U}[0,1]$ by using the function numpy.random.Generator.uniform. We must first instantiate a random number generator (RNG) with numpy.random.default_rng in Numpy. We provide a seed as an initial state for the RNG. Using the same seed again ensures reproducibility.

In [21]:
seed = 535
rng = np.random.default_rng(seed)
rng.uniform()
Out[21]:
0.9836159914889122
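
As a quick sanity check (not in the original notes), re-instantiating a generator with the same seed reproduces the exact same draw:

rng2 = np.random.default_rng(seed)
rng2.uniform()   # returns the same value as above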

Now we take $n$ samples from $\mathrm{U}[0,1]$ and compute their sample mean. We repeat $k$ times and display the empirical distribution of the sample means using a histogram. We start with $n=10$.

In [22]:
n, k = 10, 1000
sample_mean = [np.mean(rng.random(n)) for i in range(k)]
plt.hist(sample_mean, bins=10, color='lightblue', edgecolor='black')
plt.xlim(0,1)
plt.show()

Taking $n$ much larger leads to more concentration around the mean.

In [23]:
n, k = 100, 1000
sample_mean = [np.mean(rng.random(n)) for i in range(k)]
plt.hist(sample_mean, bins=10, color='lightblue', edgecolor='black')
plt.xlim(0,1)
plt.show()

Normal distribution $\idx{normal or Gaussian distribution}\xdi$ Recall that a standard Normal variable $X$ has PDF

$$ f_X(x) = \frac{1}{\sqrt{2 \pi}} \exp\left( - x^2/2 \right). $$

Its mean is $0$ and its variance is $1$.
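
As a quick empirical check (a short sketch, not part of the original notes), we can draw a large standard Normal sample and confirm that its sample mean and variance are close to $0$ and $1$. We use a separate generator here so as not to disturb the state of rng.

rng_check = np.random.default_rng(0)   # separate generator so rng is undisturbed
z = rng_check.normal(0, 1, 100000)
print(np.mean(z), np.var(z))   # both should be close to 0 and 1, respectively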

NUMERICAL CORNER: The following function generates $n$ data points from a spherical $d$-dimensional Gaussian with variance $\sigma^2$ and mean $\bmu$.

Below, rng.normal(0,1,(n,d)) generates n independent d-dimensional spherical Gaussian vectors with mean $\mathbf{0}$ and identity covariance, stored as the rows of an n by d array.

Throughout, when defining a function that uses a random number generator (RNG), we initialize the RNG outside the function and pass it to the function. This allows us to maintain control over the random number generation process at a higher level and ensures consistent results across multiple runs.

In [25]:
def spherical_gaussian(rng, d, n, mu, sig):
    return mu + sig * rng.normal(0,1,(n,d))

We generate $100$ data points in dimension $d=2$. We take $\sigma^2 = 1$ and $\bmu = w \mathbf{e}_1$. Below we use the function numpy.hstack to create a vector by concatenating two given vectors. We use [w] to create a vector with a single entry w. We also use the function numpy.zeros to create an all-zero vector.

In [26]:
d, n, w, sig = 2, 100, 3., 1.
mu = np.hstack(([w], np.zeros(d-1)))
X = spherical_gaussian(rng, d, n, mu, sig)
plt.scatter(X[:,0], X[:,1], s=5, c='k')
plt.axis([-1, 7, -4, 4])
plt.show()

Sampling from a two-component mixture of spherical Gaussians is straightforward to implement: we use numpy.random.Generator.choice to choose the component of each sample.

The code is the following. It returns an n by d array X, where each row is a sample from a 2-component spherical Gaussian mixture.

In [27]:
def gmm2spherical(rng, d, n, phi0, phi1, mu0, sig0, mu1, sig1):

    # stack the parameters of the two components
    phi, mu, sig = np.stack((phi0, phi1)), np.stack((mu0, mu1)), np.stack((sig0, sig1))

    X = np.zeros((n,d))
    # choose a component for each sample, then draw from that component
    component = rng.choice(2, size=n, p=phi)
    for i in range(n):
        X[i,:] = spherical_gaussian(rng, d, 1, mu[component[i],:], sig[component[i]])

    return X

NUMERICAL CORNER: Let us try it with the following parameters.

In [28]:
d, n, w, sig0, sig1, phi0, phi1 = 2, 1000, 3., 1.5, 0.5, 0.2, 0.8
mu0, mu1 = np.hstack(([w], np.zeros(d-1))), np.hstack(([-w], np.zeros(d-1)))
X = gmm2spherical(rng, d, n, phi0, phi1, mu0, sig0, mu1, sig1)
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=5, color='k')
plt.axis([-8, 8, -4, 4])
plt.show()

As expected, we observe two clusters. The one on the right (component $0$) is sparser (i.e., it contains fewer data points) since phi0 is much smaller than phi1. It is also more spread out, since its variance is larger.

Clustering: an objective, an algorithm and a guarantee¶

Lloyd's algorithm and its analysis¶

We are now ready to describe Lloyd's algorithm$\idx{Lloyd's algorithm}\xdi$. We start from a random assignment of clusters. (An alternative initialization strategy is to choose $k$ representatives at random among the data points.) We then alternate between the optimal choices in the lemmas. In lieu of pseudo-code, we write out the algorithm in Python. We will use this approach throughout the book.

The input X is assumed to be a collection of $n$ vectors $\mathbf{x}_1, \ldots, \mathbf{x}_n \in \mathbb{R}^d$ stacked into a matrix, with one row for each data point. The other input, k, is the desired number of clusters. There is an optional input maxiter for the maximum number of iterations, which is set to $5$ by default.

We first define separate functions for the two main steps. To find the index of the minimum entry of an array, we use the function numpy.argmin. We also use numpy.linalg.norm to compute the Euclidean distance.

In [29]:
def opt_reps(X, k, assign):
    # for a fixed assignment, the optimal representative of each cluster
    # is the mean (centroid) of the points assigned to it
    (n, d) = X.shape
    reps = np.zeros((k, d))
    for i in range(k):
        in_i = [j for j in range(n) if assign[j] == i]
        reps[i,:] = np.sum(X[in_i,:], axis=0) / len(in_i)
    return reps

def opt_clust(X, k, reps):
    # for fixed representatives, the optimal assignment sends each point
    # to its closest representative in Euclidean distance
    (n, d) = X.shape
    dist = np.zeros(n)
    assign = np.zeros(n, dtype=int)
    for j in range(n):
        dist_to_i = np.array([LA.norm(X[j,:] - reps[i,:]) for i in range(k)])
        assign[j] = np.argmin(dist_to_i)
        dist[j] = dist_to_i[assign[j]]
    G = np.sum(dist ** 2)
    print(G)  # print the current value of the k-means objective
    return assign

The main function follows. Below, rng.integers(0,k,n) is an array of n uniformly chosen integers between 0 and k-1 (inclusive); see numpy.random.Generator.integers for details. Recall that, throughout, when defining a function that uses a random number generator (RNG), we initialize the RNG outside the function and pass it in. This allows us to maintain control over the random number generation process at a higher level and ensures consistent results across multiple runs.

In [30]:
def kmeans(rng, X, k, maxiter=5):
    (n, d) = X.shape
    assign = rng.integers(0,k,n)  # random initial assignment
    reps = np.zeros((k, d), dtype=int)
    for iter in range(maxiter):
        # alternate between the two optimal steps
        reps = opt_reps(X, k, assign)
        assign = opt_clust(X, k, reps)
    return assign

NUMERICAL CORNER: We apply our implementation of $k$-means to the example above. We fix k to $3$. Here the data matrix X is the following:

In [31]:
seed = 535
rng = np.random.default_rng(seed)
X = np.array([[1., 0.],[-2., 0.],[-2.,1.],[1.,-3.],
              [-10.,10.],[2.,-2.],[-3.,1.],[3.,-1.]])
assign = kmeans(rng, X, 3)
162.7
74.8611111111111
9.083333333333334
9.083333333333334
9.083333333333334

We visualize the output by coloring the points according to their cluster assignment.

In [32]:
plt.scatter(X[:,0], X[:,1], s=10, c=assign, cmap='brg')
plt.axis([-11,4,-4,11])
plt.show()

We can compute the final representatives (optimal for the final assignment) by using the subroutine opt_reps.

In [33]:
print(opt_reps(X, 3, assign))
[[ -2.33333333   0.66666667]
 [  1.75        -1.5       ]
 [-10.          10.        ]]

Each row is the center of the corresponding cluster. Note that these match the ones we previously computed. Indeed, the clustering is the same (although the clusters are not necessarily listed in the same order).

NUMERICAL CORNER: We will test our implementation of $k$-means on the penguins dataset introduced earlier in the chapter. We first extract the columns and combine them into a data matrix X. As we did previously, we also remove the rows with missing values.

In [34]:
data = pd.read_csv('penguins-measurements.csv')
data = data.dropna()
X = data[['bill_length_mm', 'bill_depth_mm', 
        'flipper_length_mm', 'body_mass_g']].to_numpy()

We visualize a two-dimensional slice of the data.

In [35]:
plt.scatter(X[:,1], X[:,3], s=5, c='k')
plt.xlabel('bill_depth_mm'), plt.ylabel('body_mass_g')
plt.show()

Observe that the features have quite different scales (tens versus thousands in the plot above). In such a case, it is common to standardize the data so that each feature has roughly the same scale. For each column of X, we subtract its empirical mean and divide by its empirical standard deviation.

In [36]:
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mean) / std

Now we run Lloyd's algorithm with $k=2$ clusters.

In [37]:
assign = kmeans(rng, X, 2)
1338.2046936914157
820.9361062178352
603.8787658966849
575.2587351391593
567.7837494880662

We visualize the output as we did before, but this time coloring the data points by their cluster assignment.

In [38]:
plt.scatter(X[:,1], X[:,3], s=5, c=assign, cmap='brg')
plt.xlabel('bill_depth (standardized)'), plt.ylabel('body_mass (standardized)')
plt.show()

This clustering looks quite good. Nevertheless recall that:

  1. in this plot we are looking at only two of the four variables while $k$-means uses all of them,

  2. we are not guaranteed to find the best solution (see the brief illustration after this list),

  3. our objective function is somewhat arbitrary, and

  4. it is not clear what the right choice of $k$ is.
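
To illustrate point 2 (a brief sketch, not part of the original text), we can rerun Lloyd's algorithm with a few different random initializations; the final objective values printed by opt_clust may differ across runs, reflecting convergence to different local minima.

for trial_seed in [1, 2, 3]:
    trial_rng = np.random.default_rng(trial_seed)
    _ = kmeans(trial_rng, X, 2)   # watch the last printed objective of each run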

In fact, the original dataset contained the correct answer, as provided by biologists. Despite what the figure above may lead us to believe, there are in reality three separate species. So let us try with $k=3$ clusters.

In [39]:
assign = kmeans(rng, X, 3)
1312.344945158482
577.1700837839458
428.50397345437966
392.2616692426171
383.3452894259011

The output does not seem quite right.

In [40]:
plt.scatter(X[:,1], X[:,3], s=5, c=assign, cmap='brg')
plt.xlabel('bill_depth (standardized)'), plt.ylabel('body_mass (standardized)')
plt.show()

But, remembering the warnings mentioned previously, let us look at a different two-dimensional slice.

In [41]:
plt.scatter(X[:,0], X[:,3], s=5, c=assign, cmap='brg')
plt.xlabel('bill_length (standardized)'), plt.ylabel('body_mass (standardized)')
plt.show()

Let us load up the truth and compare. We only keep those samples that were not removed because of missing values (see pandas.DataFrame.iloc).

In [42]:
data_truth = pd.read_csv('penguins-species.csv') 
data_truth = data_truth.iloc[data.index]
data_truth.head()
Out[42]:
species
0 Adelie
1 Adelie
2 Adelie
4 Adelie
5 Adelie

The species are:

In [43]:
species = data_truth['species']
print(species.unique())
['Adelie' 'Chinstrap' 'Gentoo']

To plot the outcome, we color the species blue-green-red using a dictionary.

In [44]:
species2color_dict = {'Adelie': 'blue', 'Chinstrap': 'lime', 'Gentoo': 'red'}
truth = [species2color_dict[a] for a in species]

Finally, we can compare the output to the truth. The match is quite good -- but certainly not perfect.

In [45]:
f, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(6.5, 3))
ax1.scatter(X[:,0], X[:,3], s=5, c=truth)
ax1.set_title('truth')
ax2.scatter(X[:,0], X[:,3], s=5, c=assign, cmap='brg')
ax2.set_title('kmeans')
plt.show()

Determining the appropriate number of clusters is not a straightforward problem. To quote Wikipedia:

The correct choice of $k$ is often ambiguous, with interpretations depending on the shape and scale of the distribution of points in a data set and the desired clustering resolution of the user. In addition, increasing $k$ without penalty will always reduce the amount of error in the resulting clustering, to the extreme case of zero error if each data point is considered its own cluster (i.e., when $k$ equals the number of data points, $n$). Intuitively then, the optimal choice of $k$ will strike a balance between maximum compression of the data using a single cluster, and maximum accuracy by assigning each data point to its own cluster. If an appropriate value of $k$ is not apparent from prior knowledge of the properties of the data set, it must be chosen somehow. There are several categories of methods for making this decision.

In practice, several heuristics are in use. Other approaches to clustering, e.g., DBSCAN and hierarchical clustering, do not require the number of clusters as an input.
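
One common heuristic is the so-called elbow method: run $k$-means for a range of values of $k$ and plot the final objective value; the point where the curve stops decreasing sharply suggests a reasonable choice. Here is a minimal sketch (not part of the original text) on the standardized penguin data X, reusing the kmeans and opt_reps functions defined above; the helper kmeans_objective is ours, introduced only for this illustration.

def kmeans_objective(X, k, assign):
    # recompute the optimal representatives for this assignment and
    # return the sum of squared distances to them (the k-means objective)
    reps = opt_reps(X, k, assign)
    return np.sum([LA.norm(X[j,:] - reps[assign[j],:]) ** 2 for j in range(X.shape[0])])

objs = []
for k in range(1, 7):
    assign_k = kmeans(rng, X, k)   # prints the objective at each iteration
    objs.append(kmeans_objective(X, k, assign_k))

plt.plot(range(1, 7), objs, c='k', marker='o')
plt.xlabel('k'), plt.ylabel('final objective')
plt.show()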

Some observations about high-dimensional data¶

Clustering in high dimension¶

In this section, we test our implementation of $k$-means on a simple simulated dataset in high dimension.

The following function generates $n$ data points from a mixture of two equally likely spherical $d$-dimensional Gaussians with variance $1$, one with mean $w\mathbf{e}_1$ and one with mean $-w \mathbf{e}_1$. We use gmm2spherical from the previous section, which is found in mmids.py.

In [46]:
def two_mixed_clusters(rng, d, n, w):
    mu0 = np.hstack(([w], np.zeros(d-1)))
    mu1 = np.hstack(([-w], np.zeros(d-1)))
    return mmids.gmm2spherical(rng, d, n, 0.5, 0.5, mu0, 1, mu1, 1)

NUMERICAL CORNER: We start with $d=2$.

In [47]:
seed = 535
rng = np.random.default_rng(seed)
d, n, w = 2, 100, 3.
X = two_mixed_clusters(rng, d, n, w)

Let's run $k$-means on this dataset using $k=2$. We use kmeans() from the mmids.py file.

In [48]:
assign = mmids.kmeans(rng, X, 2)
1044.8267883490312
208.5284166285488
204.02397716710018
204.02397716710018
204.02397716710018

Our default number of iterations seems to have been enough for the algorithm to converge. We can visualize the result by coloring the points according to the assignment.

In [49]:
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=10, c=assign, cmap='brg')
plt.axis([-6,6,-3,3])
plt.show()

Let's see what happens in higher dimension. We repeat our experiment with $d=1000$.

In [50]:
d, n, w = 1000, 100, 3.
X = two_mixed_clusters(rng, d, n, w)

Again, we observe two clearly delineated clusters.

In [51]:
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=10, c='k')
plt.axis([-6,6,-3,3])
plt.show()

This dataset is in $1000$ dimensions, but we have plotted only the first two coordinates. If we instead plot any two coordinates that do not include the first one, we see only one cluster.

In [52]:
plt.figure(figsize=(6,3))
plt.scatter(X[:,1], X[:,2], s=10, c='k')
plt.axis([-6,6,-3,3])
plt.show()

Let's see how $k$-means fares on this dataset.

In [53]:
assign = mmids.kmeans(rng, X, 2)
99518.03165136592
99518.03165136592
99518.03165136592
99518.03165136592
99518.03165136592

Our attempt at clustering does not appear to have been successful.

In [54]:
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=10, c=assign, cmap='brg')
plt.axis([-6,6,-3,3])
plt.show()

What happened? While the clusters are easy to tease apart if we know to look at the first coordinate only, in the full space the within-cluster and between-cluster distances become harder to distinguish: the noise overwhelms the signal.

As the dimension increases, the distributions of intra-cluster and inter-cluster distances overlap significantly and become more or less indistinguishable. That provides some insight into why clustering may fail here. Note that we used the same offset $w$ for all simulations. On the other hand, if the separation between the clusters were sufficiently large, one would expect clustering to work even in high dimension.
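
To see this concretely, here is a minimal sketch (not part of the original text) comparing within-cluster and between-cluster distances in $d = 1000$. It reuses the spherical_gaussian function defined earlier and generates the two components separately so that the true labels are known; we use a separate generator so the state of rng is undisturbed.

rng_hd = np.random.default_rng(0)   # separate generator, introduced only for this sketch
d, n, w = 1000, 100, 3.
X0 = spherical_gaussian(rng_hd, d, n, np.hstack(([w], np.zeros(d-1))), 1.)
X1 = spherical_gaussian(rng_hd, d, n, np.hstack(([-w], np.zeros(d-1))), 1.)

# pairwise distances within one component and across the two components
within = [LA.norm(X0[i,:] - X0[j,:]) for i in range(n) for j in range(i+1, n)]
between = [LA.norm(X0[i,:] - X1[j,:]) for i in range(n) for j in range(n)]

plt.hist(within, bins=30, alpha=0.5, label='within-cluster')
plt.hist(between, bins=30, alpha=0.5, label='between-cluster')
plt.xlabel('distance')
plt.legend()
plt.show()

The two histograms overlap heavily in high dimension, whereas repeating the experiment with $d = 2$ would show them well separated.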

TRY IT! What precedes (and what follows in the next subsection) is not a formal proof that $k$-means clustering will be unsuccessful here. The behavior of the algorithm is quite complex and depends, in particular, on the initialization and the density of points. Here, increasing the number of data points eventually leads to a much better performance. Explore this behavior on your own by modifying the code. (For some theoretical justifications (beyond this course), see here and here.)

NUMERICAL CORNER: We can check the theorem in a simulation. Here we pick $n$ points uniformly at random in the $d$-cube $\mathcal{C}$, for a range of dimensions up to dmax. We then plot the frequency of landing in the inscribed $d$-ball $\mathcal{B}$ and see that it rapidly converges to $0$. Alternatively, we could just plot the formula for the volume of $\mathcal{B}$. But knowing how to do simulations is useful in situations where explicit formulas are unavailable or intractable. We plot the result up to dimension $10$.
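
For reference (a standard formula, not stated explicitly in the original text), the exact probability of landing in $\mathcal{B}$ is the ratio of volumes of the inscribed ball of radius $1/2$ and the unit cube,

$$ \frac{\mathrm{vol}(\mathcal{B})}{\mathrm{vol}(\mathcal{C})} = \frac{\pi^{d/2}}{2^d \, \Gamma(d/2 + 1)}, $$

which indeed converges to $0$ rapidly as $d$ grows.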

In [55]:
dmax, n = 10, 1000

in_ball = np.zeros(dmax)
for d in range(dmax):
    in_ball[d] = np.mean([(LA.norm(rng.random(d+1) - 1/2) < 1/2) for _ in range(n)])
    
plt.plot(np.arange(1,dmax+1), in_ball, c='k') 
plt.show()