Course: Math 535 - Mathematical Methods in Data Science (MMiDS)
Chapter: 4-Singular value decomposition
Author: Sebastien Roch, Department of Mathematics, University of Wisconsin-Madison
Updated: July 15, 2024
Copyright: © 2024 Sebastien Roch
$\newcommand{\bmu}{\boldsymbol{\mu}}$ $\newcommand{\bSigma}{\boldsymbol{\Sigma}}$ $\newcommand{\bfbeta}{\boldsymbol{\beta}}$ $\newcommand{\bflambda}{\boldsymbol{\lambda}}$ $\newcommand{\bgamma}{\boldsymbol{\gamma}}$ $\newcommand{\bsigma}{{\boldsymbol{\sigma}}}$ $\newcommand{\bpi}{\boldsymbol{\pi}}$ $\newcommand{\btheta}{{\boldsymbol{\theta}}}$ $\newcommand{\bphi}{\boldsymbol{\phi}}$ $\newcommand{\balpha}{\boldsymbol{\alpha}}$ $\newcommand{\blambda}{\boldsymbol{\lambda}}$ $\renewcommand{\P}{\mathbb{P}}$ $\newcommand{\E}{\mathbb{E}}$ $\newcommand{\indep}{\perp\!\!\!\perp} \newcommand{\bx}{\mathbf{x}}$ $\newcommand{\bp}{\mathbf{p}}$ $\renewcommand{\bx}{\mathbf{x}}$ $\newcommand{\bX}{\mathbf{X}}$ $\newcommand{\by}{\mathbf{y}}$ $\newcommand{\bY}{\mathbf{Y}}$ $\newcommand{\bz}{\mathbf{z}}$ $\newcommand{\bZ}{\mathbf{Z}}$ $\newcommand{\bw}{\mathbf{w}}$ $\newcommand{\bW}{\mathbf{W}}$ $\newcommand{\bv}{\mathbf{v}}$ $\newcommand{\bV}{\mathbf{V}}$ $\newcommand{\bfg}{\mathbf{g}}$ $\newcommand{\bfh}{\mathbf{h}}$ $\newcommand{\horz}{\rule[.5ex]{2.5ex}{0.5pt}}$ $\renewcommand{\S}{\mathcal{S}}$ $\newcommand{\X}{\mathcal{X}}$ $\newcommand{\var}{\mathrm{Var}}$ $\newcommand{\pa}{\mathrm{pa}}$ $\newcommand{\Z}{\mathcal{Z}}$ $\newcommand{\bh}{\mathbf{h}}$ $\newcommand{\bb}{\mathbf{b}}$ $\newcommand{\bc}{\mathbf{c}}$ $\newcommand{\cE}{\mathcal{E}}$ $\newcommand{\cP}{\mathcal{P}}$ $\newcommand{\bbeta}{\boldsymbol{\beta}}$ $\newcommand{\bLambda}{\boldsymbol{\Lambda}}$ $\newcommand{\cov}{\mathrm{Cov}}$ $\newcommand{\bfk}{\mathbf{k}}$ $\newcommand{\idx}[1]{}$ $\newcommand{\xdi}{}$
Motivating example: visualizing viral evolution
We consider an application of dimensionality reduction in biology. We will look at single-nucleotide polymorphism (SNP)$\idx{single-nucleotide polymorphism}\xdi$ data from viruses. A little background first. From Wikipedia:
A single-nucleotide polymorphism (SNP) is a substitution of a single nucleotide that occurs at a specific position in the genome, where each variation is present at a level of more than 1% in the population. For example, at a specific base position in the human genome, the C nucleotide may appear in most individuals, but in a minority of individuals, the position is occupied by an A. This means that there is a SNP at this specific position, and the two possible nucleotide variations -- C or A -- are said to be the alleles for this specific position.
Quoting Jombart et al., BMC Genetics (2010), we analyze:
the population structure of seasonal influenza A/H3N2 viruses using hemagglutinin (HA) sequences. Changes in the HA gene are largely responsible for immune escape of the virus (antigenic shift), and allow seasonal influenza to persist by mounting yearly epidemics peaking in winter. These genetic changes also force influenza vaccines to be updated on a yearly basis. [...] Assessing the genetic evolution of a pathogen through successive epidemics is of considerable epidemiological interest. In the case of seasonal influenza, we would like to ascertain how genetic changes accumulate among strains from one winter epidemic to the next.
Some details about the Jombart et al. dataset:
For this purpose, we retrieved all sequences of H3N2 hemagglutinin (HA) collected between 2001 and 2007 available from Genbank. Only sequences for which a location (country) and a date (year and month) were available were retained, which allowed us to classify strains into yearly winter epidemics.
We load a dataset, which contains a subset of strains from the dataset mentioned above.
data = pd.read_csv('h3n2-snp.csv')
This is a large dataset. Here are the first five rows and the first ten columns.
print(data.iloc[:5, :10])
     strain  s6a  s6c  s6g  s17a  s17g  s17t  s39a  s39c  s39g
0  AB434107  1.0  0.0  0.0   1.0   0.0   0.0   0.0   0.0   1.0
1  AB434108  1.0  0.0  0.0   1.0   0.0   0.0   0.0   0.0   1.0
2  CY000113  1.0  0.0  0.0   1.0   0.0   0.0   0.0   0.0   1.0
3  CY000209  1.0  0.0  0.0   1.0   0.0   0.0   0.0   0.0   1.0
4  CY000217  1.0  0.0  0.0   1.0   0.0   0.0   0.0   0.0   1.0
For positions 6, 17, 39, etc., the corresponding columns indicate with a 1.0 which nucleotide (a, c, g, t) is present in the strain. For example, strain AB434107 has an a at positions 6 and 17, and a g at position 39.
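For instance, we can confirm this encoding directly from the dataframe. This is a quick check of ours (not part of the original analysis), using only column names that appear in the header above.

data.loc[data['strain'] == 'AB434107', ['s6a', 's6c', 's6g', 's17a', 's17g', 's17t', 's39a', 's39c', 's39g']]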
Overall, it contains $1642$ strains (whose names are listed in the first column). The data lives in a $317$-dimensional space (not counting the name of the strain, i.e., the first column).
data.shape
(1642, 318)
Obviously, visualizing this data is not straightforward. How can we make sense of it? More specifically, how can we explore any underlying structure it might have? Quoting Wikipedia:
In statistics, exploratory data analysis (EDA) is an approach of analyzing data sets to summarize their main characteristics, often using statistical graphics and other data visualization methods. [...] Exploratory data analysis has been promoted by John Tukey since 1970 to encourage statisticians to explore the data, and possibly formulate hypotheses that could lead to new data collection and experiments.
In this chapter, we will encounter an important mathematical technique for dimension reduction, which allows us to explore this data -- and find interesting structure -- in $2$ (rather than $317$!) dimensions.
Background: review of matrix rank and spectral decomposition
NUMERICAL CORNER: In Numpy, one can compute the rank of a matrix using the function numpy.linalg.matrix_rank. We will see later in the chapter how to compute it using the singular value decomposition (which is how LA.matrix_rank does it). Let's try the example above.
w1 = np.array([1., 0., 1.])
w2 = np.array([0., 1., 1.])
w3 = np.array([1., -1., 0.])
A = np.stack((w1, w2, w3), axis=-1)
print(A)
[[ 1.  0.  1.]
 [ 0.  1. -1.]
 [ 1.  1.  0.]]
We compute the rank of A.
LA.matrix_rank(A)
2
We take only the first two columns of A this time to form B.
B = np.stack((w1, w2),axis=-1)
print(B)
[[1. 0.]
 [0. 1.]
 [1. 1.]]
LA.matrix_rank(B)
2
Recall that, in Numpy, @ is used for matrix product.
C = np.array([[1., 0., 1.],[0., 1., -1.]])
print(C)
[[ 1.  0.  1.]
 [ 0.  1. -1.]]
LA.matrix_rank(C)
2
print(B @ C)
[[ 1.  0.  1.]
 [ 0.  1. -1.]
 [ 1.  1.  0.]]
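To preview the connection with the singular value decomposition mentioned above, here is a minimal sketch of how the rank can be read off from the singular values: it is the number of nonzero singular values which, in floating point, means the number above a small tolerance. (The helper rank_via_svd and the tolerance 1e-10 are ours for illustration; numpy.linalg.matrix_rank chooses its tolerance more carefully.)

def rank_via_svd(M, tol=1e-10):
    # singular values of M, in nonincreasing order
    s = LA.svd(M, compute_uv=False)
    # the rank is the number of singular values above the tolerance
    return int(np.sum(s > tol))

print(rank_via_svd(A), rank_via_svd(B), rank_via_svd(C))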
NUMERICAL CORNER: In Numpy, the eigenvalues and eigenvectors of a matrix can be computed using numpy.linalg.eig.
A = np.array([[2.5, -0.5], [-0.5, 2.5]])
w, v = LA.eig(A)
print(w)
print(v)
[3. 2.]
[[ 0.70710678  0.70710678]
 [-0.70710678  0.70710678]]
Above, w holds the eigenvalues in an array, whereas the columns of v are the corresponding eigenvectors.
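As a quick sanity check (ours, not in the original notebook), each eigenpair should satisfy the defining relation $A \mathbf{v}_i = \lambda_i \mathbf{v}_i$:

for i in range(len(w)):
    # column i of v should be an eigenvector of A with eigenvalue w[i]
    print(np.allclose(A @ v[:, i], w[i] * v[:, i]))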
NUMERICAL CORNER: Hence, we can check whether a matrix is positive semidefinite by computing its eigenvalues using numpy.linalg.eig.
A = np.array([[1, -1], [-1, 1]])
w, v = LA.eig(A)
print(w)
[2. 0.]
B = np.array([[1, -2], [-2, 1]])
z, u = LA.eig(B)
print(z)
[ 3. -1.]
KNOWLEDGE CHECK: Which one(s) of these matrices is positive semidefinite?
$$
A = \begin{pmatrix} 1 & -1\\ -1 & 1 \end{pmatrix} \qquad B = \begin{pmatrix} 1 & -2\\ -2 & 1 \end{pmatrix}
$$

a) Both
b) $A$
c) $B$
d) Neither
$\checkmark$
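Here is a small helper along those lines (a sketch of ours, not part of mmids.py). Since the matrices of interest are symmetric, we can use numpy.linalg.eigvalsh, which is tailored to symmetric matrices and returns real eigenvalues, and declare the matrix positive semidefinite if all eigenvalues are nonnegative up to a small tolerance.

def is_psd(M, tol=1e-10):
    # eigvalsh assumes M is symmetric; eigenvalues are returned in ascending order
    return bool(np.all(LA.eigvalsh(M) >= -tol))

print(is_psd(A), is_psd(B))

Applied to the A and B above, this reproduces the conclusion suggested by the eigenvalues printed earlier.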
Power iteration
We implement the algorithm suggested by the Power Iteration Lemma. That is, we compute $B^{k} \mathbf{x}$, then normalize it. To obtain the corresponding singular value and left singular vector, we use that $\sigma_1 = \|A \mathbf{v}_1\|$ and $\mathbf{u}_1 = A \mathbf{v}_1/\sigma_1$.
def topsing(rng, A, maxiter=10):
    # random starting vector with dimension equal to the number of columns of A
    x = rng.normal(0, 1, np.shape(A)[1])
    B = A.T @ A
    # power iteration: repeatedly apply B = A^T A
    for _ in range(maxiter):
        x = B @ x
    # normalize to obtain the top right singular vector
    v = x / LA.norm(x)
    # corresponding singular value and left singular vector
    s = LA.norm(A @ v)
    u = A @ v / s
    return u, s, v
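One caveat (our remark, not from the book): the entries of $B^{k} \mathbf{x}$ grow or shrink geometrically with $k$, so for a much larger maxiter it is safer to renormalize at every iteration, which changes nothing mathematically since only the direction matters. A possible variant, under the same imports as above:

def topsing_stable(rng, A, maxiter=100):
    x = rng.normal(0, 1, np.shape(A)[1])
    B = A.T @ A
    for _ in range(maxiter):
        x = B @ x
        # renormalizing at each step avoids overflow or underflow
        x = x / LA.norm(x)
    v = x
    s = LA.norm(A @ v)
    u = A @ v / s
    return u, s, v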
NUMERICAL CORNER: We will apply it to our previous two-cluster example. The necessary functions are in mmids.py, which is available on the book's GitHub repository.
seed = 42
rng = np.random.default_rng(seed)
d, n, w = 10, 100, 3.
X = mmids.two_mixed_clusters(rng, d, n, w)
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=10, c='k')
plt.axis([-6,6,-3,3])
plt.show()
Let's compute the top singular vector.
u, s, v = topsing(rng, X)
print(v)
[ 0.99257882 0.10164805 0.01581003 0.03202184 0.02075852 0.02798115 -0.02920916 -0.028189 -0.0166094 -0.00648726]
This is approximately $\mathbf{e}_1$. We get roughly the same answer (possibly up to sign) from Numpy's numpy.linalg.svd function.
u, s, vh = LA.svd(X)
print(vh.T[:,0])
[ 0.99257882 0.10164803 0.01581003 0.03202184 0.02075851 0.02798112 -0.02920917 -0.028189 -0.01660938 -0.00648724]
Recall that, when we applied $k$-means clustering to this example with $d=1000$ dimensions, we obtained a very poor clustering.
d, n, w = 1000, 100, 3.
X = mmids.two_mixed_clusters(rng, d, n, w)
assign = mmids.kmeans(rng, X, 2)
99423.42794703908
99423.42794703908
99423.42794703908
99423.42794703908
99423.42794703908
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=10, c=assign, cmap='brg')
plt.axis([-6,6,-3,3])
plt.show()
Let's try again, but after projecting onto the top singular vector. Recall that this corresponds to finding the best one-dimensional approximating subspace. The projection can be computed using the truncated SVD $Z= U_{(1)} \Sigma_{(1)} V_{(1)}^T$. We can interpret the rows of $U_{(1)} \Sigma_{(1)}$ as the coefficients of each data point in the basis $\mathbf{v}_1$. We will work in that basis. We need one small hack: because our implementation of $k$-means clustering expects data points in at least $2$ dimensions, we add a column of $0$'s.
u, s, v = topsing(rng, X)
Xproj = np.stack((u*s, np.zeros(np.shape(X)[0])), axis=-1)
fig = plt.figure()
ax = fig.add_subplot(111, aspect='equal')
ax.scatter(Xproj[:,0], Xproj[:,1], s=10, c='b', alpha=0.25)
plt.ylim([-3,3])
plt.show()
There is a small -- yet noticeable -- gap around 0. We run $k$-means clustering on the projected data.
assign = mmids.kmeans(rng, Xproj, 2)
1779.020119584778
514.1899426112672
514.1899426112672
514.1899426112672
514.1899426112672
plt.figure(figsize=(6,3))
plt.scatter(X[:,0], X[:,1], s=10, c=assign, cmap='brg')
plt.axis([-6,6,-3,3])
plt.show()
Much better. We give a more formal explanation of this outcome in a subsequent section. In essence, quoting [BHK, Section 7.5.1]:
[...] let's understand the central advantage of doing the projection to [the top $k$ right singular vectors]. It is simply that for any reasonable (unknown) clustering of data points, the projection brings data points closer to their cluster centers.
Finally, looking at the top right singular vector (or its first ten entries for lack of space), we see that it does align quite well (but not perfectly) with the first dimension.
print(v[:10])
[-0.55564563 -0.02433674 0.02193487 -0.0333936 -0.00445505 -0.00243003 0.02576056 0.02523275 -0.00682153 0.02524646]
Application: principal components analysis
Having established a formal connection between PCA and SVD, we implement PCA using the SVD algorithm numpy.linalg.svd. We perform mean centering (now is the time to read that quote about the importance of mean centering again), but not the optional standardization. We use the fact that, in Numpy, subtracting from a matrix a vector whose length matches the number of columns subtracts it from every row.
def pca(X, l):
    # center the data: subtract the column means from every row
    mean = np.mean(X, axis=0)
    Y = X - mean
    # compact SVD of the centered data matrix
    U, S, Vt = LA.svd(Y, full_matrices=False)
    # the first l principal components are the rows of U_(l) Sigma_(l)
    return U[:, :l] @ np.diag(S[:l])
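As a quick consistency check of the implementation (ours; rng_check, M and Yc are just illustrative names, and M is a small random matrix), the output of pca should coincide with projecting the centered data onto the top $\ell$ right singular vectors, since $Y V_{(\ell)} = U_{(\ell)} \Sigma_{(\ell)}$ when $Y = U \Sigma V^T$:

rng_check = np.random.default_rng(0)
M = rng_check.normal(0, 1, (20, 5))
Yc = M - np.mean(M, axis=0)
U, S, Vt = LA.svd(Yc, full_matrices=False)
# the two ways of computing the first two principal components should agree
print(np.allclose(pca(M, 2), Yc @ Vt[:2, :].T))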
NUMERICAL CORNER: We apply it to data from our two-cluster Gaussian mixture model.
seed = 535
rng = np.random.default_rng(seed)
d, n, w = 1000, 100, 3.
X = mmids.two_mixed_clusters(rng, d, n, w)
T = pca(X, 2)
Plotting the result, we see that PCA does succeed in finding the main direction of variation. Note the gap in the middle.
fig = plt.figure()
ax = fig.add_subplot(111,aspect='equal')
ax.scatter(T[:,0], T[:,1], s=5, c='k')
plt.show()
Note however that the first two principal components in fact "capture more noise" than what can be seen in the original first two coordinates, a form of overfitting.
NUMERICAL CORNER: We load the SNP dataset again. Recall that it contains $1642$ strains and lives in a $317$-dimensional space.
data = pd.read_csv('h3n2-snp.csv')
Our goal is to find a "good" low-dimensional representation of the data. We work with ten dimensions using PCA.
A = data[[data.columns[i] for i in range(1,len(data.columns))]].to_numpy()
n_dims = 10
T = pca(A, n_dims)
We plot the first two principal components, and see what appears to be some potential structure.
plt.figure(figsize=(5,3))
plt.scatter(T[:,0], T[:,1], s=10, c='k')
plt.axis([-3,6,-3,3])
plt.show()
There seem to be some reasonably well-defined clusters in this projection. We use $k$-means to identify them, taking advantage of the implementation in scikit-learn, sklearn.cluster.KMeans. By default, it finds $8$ clusters. The cluster assignments can be extracted from the attribute labels_.
from sklearn.cluster import KMeans
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, init='k-means++',
random_state=seed, n_init=10).fit(T)
assign = kmeans.labels_
To further reveal the structure, we look at how the clusters spread out over the years. That information is in a separate file.
data_oth = pd.read_csv('h3n2-other.csv')
data_oth.head()
|   | strain | length | country | year | lon | lat | date |
|---|--------|--------|---------|------|-----|-----|------|
| 0 | AB434107 | 1701 | Japan | 2002 | 137.215474 | 35.584176 | 2002/02/25 |
| 1 | AB434108 | 1701 | Japan | 2002 | 137.215474 | 35.584176 | 2002/03/01 |
| 2 | CY000113 | 1762 | USA | 2002 | -73.940000 | 40.670000 | 2002/01/29 |
| 3 | CY000209 | 1760 | USA | 2002 | -73.940000 | 40.670000 | 2002/01/17 |
| 4 | CY000217 | 1760 | USA | 2002 | -73.940000 | 40.670000 | 2002/02/26 |
year = data_oth['year'].to_numpy()
For each cluster, we plot how many of its data points come from a specific year. Each cluster has a different color.
fig, ax = plt.subplots(figsize=(6,4))
for i in range(n_clusters):
unique, counts = np.unique(year[assign == i], return_counts=True)
ax.bar(unique, counts, label=i)
ax.set(xlim=(2001, 2007), xticks=np.arange(2002, 2007))
ax.legend()
plt.show()
Remarkably, we see that each cluster comes mostly from one year or two consecutive ones. In other words, the clustering in this low-dimensional projection captures some true underlying structure that is not explicitly in the genetic data on which it is computed.
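The same information can be summarized in a contingency table, for instance with pandas (a one-line sketch of ours; crosstab simply counts how many strains fall in each cluster-year combination):

print(pd.crosstab(assign, year, rownames=['cluster'], colnames=['year']))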
Going back to the first two principal components, we color the points on the scatterplot by year. (We use legend_elements() for automatic legend creation.)
fig = plt.figure(figsize=(5,3))
ax = fig.add_subplot(111, aspect='equal')
scatter = ax.scatter(T[:,0], T[:,1], s=10, c=year, label=year)
plt.legend(*scatter.legend_elements())
plt.show()
To some extent, one can "see" the virus evolving from year to year. The $x$-axis in particular seems to correlate strongly with the year, in the sense that samples from later years tend to be towards one side of the plot.
To further quantify this observation, we use numpy.corrcoef to compute the correlation coefficients between the year and the first $10$ principal components.
corr = np.zeros(n_dims)
for i in range(n_dims):
corr[i] = np.corrcoef(np.stack((T[:,i], year)))[0,1]
print(corr)
[-0.7905001 -0.42806325 0.0870437 -0.16839491 0.05757342 -0.06046913 -0.07920042 0.01436618 -0.02544749 0.04314641]
Indeed, we see that the first two principal components correlate (negatively) with the year, the first one quite strongly, while the remaining ones show little correlation.
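As a side note (ours), the same ten correlations can be obtained in a single call by stacking the principal components and the year into one array; this is just a compact way of writing the loop above.

# row -1 of the correlation matrix holds the correlations of year with each variable
corr_all = np.corrcoef(np.vstack((T.T, year)))[-1, :n_dims]
print(corr_all)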
Further applications of the SVD: low-rank approximations and ridge regression
NUMERICAL CORNER: In Numpy, the Frobenius norm of a matrix can be computed with the function numpy.linalg.norm using its default settings, while the induced $2$-norm can be computed with the same function by setting the ord parameter to 2.
A = np.array([[1., 0.],[0., 1.],[0., 0.]])
print(A)
[[1. 0.]
 [0. 1.]
 [0. 0.]]
LA.norm(A)
1.4142135623730951
LA.norm(A, 2)
1.0
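These values are consistent with the Matrix Norms and Singular Values Lemma used later in this section: the Frobenius norm is the square root of the sum of the squared singular values, while the induced $2$-norm is the largest singular value. A quick check of ours on the matrix A above:

s = LA.svd(A, compute_uv=False)
print(np.sqrt(np.sum(s**2)))   # should match LA.norm(A)
print(s[0])                    # should match LA.norm(A, 2)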
NUMERICAL CORNER: We return to our example with the two Gaussian clusters. This time, we use a function that produces the two clusters separately.
def two_separate_clusters(rng, d, n, w):
mu0 = np.concatenate(([w], np.zeros(d-1)))
mu1 = np.concatenate(([-w], np.zeros(d-1)))
X0 = mmids.spherical_gaussian(rng, d, n, mu0, 1)
X1 = mmids.spherical_gaussian(rng, d, n, mu1, 1)
return X0, X1
We first generate the data.
seed = 535
rng = np.random.default_rng(seed)
d, n, w = 1000, 100, 3.
X1, X2 = two_separate_clusters(rng, d, n, w)
X = np.vstack((X1, X2))
In reality, we cannot compute the matrix norms of $X-C$ and $X_k-C$ as the true centers are not known. But, because this is simulated data, we happen to know the truth and we can check the validity of our results in this case. The centers are:
C1 = np.stack([np.concatenate(([-w], np.zeros(d-1))) for _ in range(n)])
C2 = np.stack([np.concatenate(([w], np.zeros(d-1))) for _ in range(n)])
C = np.vstack((C1, C2))
We use the numpy.linalg.svd function to compute the norms from the formulas in the Matrix Norms and Singular Values Lemma. First, we observe that the singular values of $X-C$ decay slowly.
uc, sc, vhc = LA.svd(X-C)
plt.plot(sc, c='k')
plt.show()
The $k$-means objective with respect to the true centers under the full-dimensional data is:
print(np.sum(sc**2))
207905.47916782406
while the square of the top singular value of $X-C$ is only:
print(sc[0]**2)
8258.19604762502
Finally, we compute the $k$-means objective with respect to the true centers under the projected one-dimensional data:
u, s, vh = LA.svd(X)
print(np.sum((s[0] * np.outer(u[:,0], vh[0,:]) - C)**2))
8099.057045408984