from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from imageio import imread
from time import time as timer
import os
import tensorflow as tf
%matplotlib inline
from matplotlib import animation
from IPython.display import HTML
import umap
from scipy.stats import entropy
if not os.path.exists('data'):
    path = os.path.abspath('.')+'/colab_material.tgz'
    tf.keras.utils.get_file(path, 'https://github.com/neworldemancer/DSF5/raw/master/colab_material.tgz')
    !tar -xvzf colab_material.tgz > /dev/null 2>&1
from utils.routines import *
Objective: clustering techniques divide a data set into groups of points sharing common features. Each data point $p$ gets assigned a label $l_p \in \{1,..,K\}$. In this presentation the data points are assumed to have $D$ features, i.e. each data point belongs to $\mathbf{R}^D$.
Methods: We call $P_k$ the subset of the data set which gets assigned to class $k$. K-means aims at minimizing the objective function: $$L=\sum_k L_k$$ $$L_k=\frac{1}{|P_k|}\sum_{p,p' \in P_k}|\mathbf{x}_p-\mathbf{x}_{p'}|^2$$ In principle one could enumerate all possible assignments, but this is computationally infeasible. The Lloyd algorithm is iterative:
The Lloyd algorithm finds local minima and may need to be started several times with different initializations.
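To make the two alternating steps concrete, here is a minimal numpy sketch of Lloyd's algorithm (an illustration only, not the sklearn implementation; the function name lloyd_kmeans and the random initialization are our own choices):
def lloyd_kmeans(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), size=k, replace=False)]  # random initial centers
    for _ in range(n_iter):
        # assignment step: each point goes to the nearest center
        dists = ((X[:, None, :] - centers[None, :, :])**2).sum(axis=-1)
        labels = dists.argmin(axis=1)
        # update step: each center moves to the mean of its assigned points
        new_centers = np.array([X[labels == j].mean(axis=0) if np.any(labels == j) else centers[j]
                                for j in range(k)])
        if np.allclose(new_centers, centers):  # converged to a (local) minimum
            break
        centers = new_centers
    return centers, labels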
Terminology and output of a K-means computation: the fitted model provides the cluster centers (centroids) and, for each data point, the label of the cluster it is assigned to.
We start with a 2D example that can be visualized.
First we load the data-set.
points=km_load_th1()
Explore the data set by checking its dimensionality.
print(points.shape)
print('We have ', points.shape[0], 'points with two features')
plt.plot(points[:,0],points[:,1],'o')
plt.xlabel('feature-1')
plt.ylabel('feature-2')
Visually, the data set appears to have three clusters. We will cluster it using K-means. As usual, we create a KMeans object. Note that we do not need to initialize it with a data set.
clusterer = KMeans(n_clusters=3, random_state=10)
A call to the fit method computes the cluster centers, which can be plotted alongside the data set. They are accessible from the cluster_centers_ attribute:
clusterer.fit(points)
plt.plot(points[:,0],points[:,1],'o')
plt.plot(clusterer.cluster_centers_[:,0],clusterer.cluster_centers_[:,1],'o',markersize=10)
The predict method assigns a new point to the nearest cluster center. We can call predict on the training data and color the data set according to the cluster labels.
cluster_labels=clusterer.predict(points)
plt.scatter(points[:,0],points[:,1],c=cluster_labels)
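As a sanity check (a sketch reusing points and clusterer from the cells above), the same labels can be obtained by explicitly picking the nearest center for each point:
dists = np.linalg.norm(points[:, None, :] - clusterer.cluster_centers_[None, :, :], axis=-1)
manual_labels = dists.argmin(axis=1)   # index of the nearest center for each point
print(np.array_equal(manual_labels, cluster_labels))  # expected: True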
Finally, we can vary the number of clusters and score each clustering with the silhouette score.
sil=[]
for iclust in range(2,6):
    clusterer = KMeans(n_clusters=iclust, random_state=10)
    cluster_labels = clusterer.fit_predict(points)
    score=silhouette_score(points,cluster_labels)
    sil.append(score)
    plt.figure()
    plt.scatter(points[:,0],points[:,1],c=cluster_labels)
plt.figure()
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')
plt.plot(np.arange(len(sil))+2, sil,'-o')
The same techniques can be used on high-dimensional data sets. We use here the famous Fashion-MNIST dataset of clothing images (goods), which we download via tensorflow.
fmnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fmnist.load_data()
X=train_images[:5000,:].reshape(5000,-1)
print(X.shape)
image=X[1232,:].reshape(28,28)
plt.imshow(image)
We can cluster the images exactly as we did for the 2d dataset.
clusterer = KMeans(n_clusters=10, random_state=10)
cluster_labels = clusterer.fit_predict(X)
We can plot the cluster centers (which can be reshaped into 28×28 images!) to see if the clustering is learning correct patterns.
for iclust in range(10):
    plt.figure()
    plt.imshow(clusterer.cluster_centers_[iclust].reshape(28,28))
You can see that the model tends to assign each cluster to a single type of good. Nevertheless, using the cluster centers and a further trick, in exercise 2 you will build a model that recognizes the goods!
### In this exercise you are given the dataset points, consisting of high-dimensional data. It was built by taking random
# samples from a number k of multidimensional gaussians. The data is therefore made of k clusters but, being
# very high dimensional, you cannot visualize it. Your task is to use K-means combined with the Silhouette
# score to find the number of clusters k.
# 1. Load the data using the function load_ex1_data_clust(), check the dimensionality of the data.
# 2. Fix a number of clusters k and define a KMeans clusterer object. Perform the fitting and compute the Silhouette score.
#    Save the results in a list.
# 3. Plot the Silhouette scores as a function of k. What is the number of clusters?
# 4. Optional. Check the result that you found via umap.
# In this exercise you are asked to use the clustering performed by K-means to predict the type of good in the f-mnist dataset.
# Here we are using the clustering as a preprocessing step for a supervised task. We therefore need the correct labels
# on a training set, and we test the result on a test set:
# 1. Load the dataset.
#fmnist = tf.keras.datasets.fashion_mnist
#(train_images, train_labels), (test_images, test_labels) = fmnist.load_data()
#X_train=train_images[:5000,:].reshape(5000,-1)
#y_train=train_labels[:5000]
#X_test=test_images[:1000,:].reshape(1000,-1)
#y_test=test_labels[:1000]
# 2. FITTING STEP: The fitting step consists, first, of the computation of the cluster centers, as was done during
#    the presentation. Second, to each cluster center we then need to assign a good-label, which is given by the
#    majority class of the samples belonging to that cluster.
#
#    You can use, if you want, the helper function most_common for this purpose.
#
#    In detail you should:
#    - fix a number of clusters (start with k=10) and define the KMeans clusterer object. Fit the model on the training set
#      using the fit method
#    - call the predict method of the KMeans object you defined on the training set and compute the cluster labels.
#      Call them cluster_labels
#    - use the function most_common with arguments (k, y_train, cluster_labels) to compute the assignement list.
#      assignement[i] will be the majority class of the i-th cluster
def most_common(nclusters, supervised_labels, cluster_labels):
    """
    Args:
      - nclusters : the number of clusters
      - supervised_labels : for each sample, the labelling provided by the training data (e.g. in y_train or y_test)
      - cluster_labels : for each sample, the cluster it was assigned to by K-Means using the predict method of the KMeans object
    Returns:
      - a list "assignement" of length nclusters, where assignement[i] is the majority class of the i-th cluster
    """
    assignement = []
    for icluster in range(nclusters):
        indices = list(supervised_labels[cluster_labels == icluster])
        try:
            chosen = max(set(indices), key=indices.count)  # majority class within this cluster
        except ValueError:
            print('Warning: cluster', icluster, 'is empty, assigning default class 1')
            chosen = 1
        assignement.append(chosen)
    return assignement
# 3. Using the assignment list and the clusterer, check the performance on the test set.
#    In detail you should:
#    - using the predict method of your KMeans object, predict the cluster labels for the test set using X_test as an argument
#    - using the cluster labels just predicted and the previously computed assignement[] list,
#      predict which goods your model assigns to the test set samples
#    - using a call to metrics.confusion_matrix(y_test, new_labels) you can print the confusion matrix on the test set, which
#      provides information on the quality of the fit.
# 4. Perform steps 2 and 3 again, increasing the number of clusters from 10 to 40. What happens to the performance?
K-Means is a modelling procedure which is biased towards clusters of circular shape and therefore does not always work perfectly, as the following examples show:
points=gm_load_th1()
clusterer = KMeans(n_clusters=3, random_state=10)
cluster_labels=clusterer.fit_predict(points)
plt.figure(figsize=(5,5))
plt.scatter(points[:,0],points[:,1],c=cluster_labels, s=2)
points=gm_load_th2()
clusterer = KMeans(n_clusters=2, random_state=10)
cluster_labels=clusterer.fit_predict(points)
plt.figure(figsize=(5,5))
plt.scatter(points[:,0],points[:,1],c=cluster_labels, s=2)
A Gaussian mixture model is able to fit these kinds of clusters. In a Gaussian mixture model each data point is assumed to be a random sample from the distribution $$f(\mathbf{x})=\sum_c \pi_c N(\mathbf{\mu_c},\mathbf{\Sigma_c} )(\mathbf{x})$$ which is called a Gaussian mixture. The parameters $\{\pi_c,\mathbf{\mu_c},\mathbf{\Sigma_c}\}$ are fitted to the data using an optimization procedure (maximum likelihood via the EM algorithm), and $N_c$ is the chosen number of clusters.
Output of a GM computation:
Given the fitted parameters, the probability $f_p(c)$ that point $p$ belongs to cluster $c$ is computed as: $$f_p(c)=\frac{ \pi_c N(\mathbf{\mu_c},\mathbf{\Sigma_c} )(\mathbf{x_p})}{\sum_{c'} \pi_{c'} N(\mathbf{\mu_{c'}},\mathbf{\Sigma_{c'}} )(\mathbf{x_p})}, \quad c=1...N_c$$
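As an illustration of this formula (a sketch on toy blob data generated here, not part of the course data), the responsibilities computed by hand from the fitted parameters should match the output of predict_proba:
from scipy.stats import multivariate_normal
X_toy, _ = make_blobs(n_samples=300, centers=3, random_state=0)
gm = GaussianMixture(n_components=3, covariance_type='full', random_state=0).fit(X_toy)
# numerator: pi_c * N(mu_c, Sigma_c)(x_p) for every component c
num = np.stack([gm.weights_[c] * multivariate_normal(gm.means_[c], gm.covariances_[c]).pdf(X_toy)
                for c in range(gm.n_components)], axis=1)
resp = num / num.sum(axis=1, keepdims=True)   # normalize over the components
print(np.allclose(resp, gm.predict_proba(X_toy)))  # expected: True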
First of all we see how the Gaussian model behaves on our original example:
points=km_load_th1()
aic=[]
bic=[]
sil=[]
for i_comp in range(2,6):
    plt.figure()
    plt.title(str(i_comp))
    clf = GaussianMixture(n_components=i_comp, covariance_type='full')
    clf.fit(points)
    cluster_labels=clf.predict(points)
    plt.scatter(points[:,0],points[:,1],c=cluster_labels)
    print(i_comp,clf.aic(points),clf.bic(points))
    score=silhouette_score(points,cluster_labels)
    aic.append(clf.aic(points))
    bic.append(clf.bic(points))
    sil.append(score)
plt.figure()
plt.plot(np.arange(2,6),aic,'-o')
plt.title('aic')
plt.grid()
plt.figure()
plt.plot(np.arange(2,6),bic,'-o')
plt.title('bic')
plt.grid()
plt.figure()
plt.plot(np.arange(2,6),sil,'-o')
plt.title('silhouette')
plt.grid()
So in this case we get comparable results, and the probabilistic criteria (AIC/BIC) agree with the silhouette score! Let's see how Gaussian mixtures behave in the examples where K-means was failing.
points=gm_load_th1()
clf = GaussianMixture(n_components=3, covariance_type='full')
clf.fit(points)
cluster_labels=clf.predict(points)
plt.figure(figsize=(5,5))
plt.scatter(points[:,0],points[:,1],c=cluster_labels, s=2)
points=gm_load_th2()
clf = GaussianMixture(n_components=2, covariance_type='full')
clf.fit(points)
cluster_labels=clf.predict(points)
plt.figure(figsize=(5,5))
plt.scatter(points[:,0],points[:,1],c=cluster_labels, s=2)
# In this exercise you need to load the dataset used to present K-means ( km_load_th1() ) or the one used to discuss
# the Gaussian mixture model ( gm_load_th1() ).
# As discussed, by applying a fit based on Gaussian mixtures you can not only predict the cluster label for each point,
# but also a probability distribution over the clusters.
# From this probability distribution, you can compute for each point the entropy of the corresponding
# distribution (using for example scipy.stats.entropy) as an estimate of the uncertainty of the prediction.
# Your task is to plot the data cloud with a color proportional to the uncertainty of the cluster assignment.
# In detail you should:
# 1. Instantiate a GaussianMixture object with the number of clusters that you expect
# 2. fit the object on the dataset with the fit method
# 3. compute the cluster probabilities using the method predict_proba. This will return a matrix of dimension
#    npoints x nclusters
# 4. use the entropy function ( from scipy.stats import entropy ) to evaluate for each point the uncertainty of the prediction
# 5. Plot the points colored according to their uncertainty. You can use for example the code
#    cm = plt.cm.get_cmap('RdYlBu')
#    sc = plt.scatter(x, y, c=colors, cmap=cm)
#    plt.colorbar(sc)
#    where colors is the list of entropies computed.
An (artificial) neural network consists of layers of neurons. The artificial neuron, or perceptron, is in fact inspired by the biological neuron.
Such a neuron first computes a linear transformation of the input vector $\bar x$: $$z = \bar W \cdot \bar x + b = \sum {W_i x_i} + b$$ where $\bar W$ is the vector of weights and $b$ is the bias.
Combining multiple such objects performing only linear transformations would not bring any additional benefit, as the combined output would still be a linear function of the inputs.
What gives actual power to neurons is that they additionally apply a nonlinear transformation to the result, the activation function $f$: $$y = f(z)$$
The most commonly used non-linear transformations are:
def ReLU(z):
    return np.maximum(z, 0)            # max(z, 0), elementwise

def LReLU(z, a=0.1):
    return np.where(z > 0, z, a * z)   # z for z > 0, a*z otherwise

def sigmoid(z):
    return 1/(1 + np.exp(-z))

def step(z):
    return np.heaviside(z, 0)
fig, ax = plt.subplots(1, 5, figsize=(16, 3))
z = np.linspace(-10, 10, 100)
ax[0].plot(z, ReLU(z))
ax[0].set_title('Rectified Linear Unit')
ax[1].plot(z, LReLU(z))
ax[1].set_title('Leaky Rectified Linear Unit')
ax[2].plot(z, sigmoid(z))
ax[2].set_title(r'$\sigma$(z)=$\frac{1}{1+e^{-z}}$')
ax[3].plot(z, np.tanh(z))
ax[3].set_title('Hyperbolic tangent')
ax[4].plot(z, step(z))
ax[4].text(-6, 0.5, 'NOT USED', size=19, c='r')
ax[4].set_title('Step function')
for axi in ax:
    axi.set_xlabel('z')
And the reason we don't use a simple step function is that it is not differentiable at zero and its derivative is zero everywhere else.
The last nonlinearity to mention here is softmax: $$y_i = Softmax(\bar z)_i = \frac{ e^{z_i}}{\sum_j e^{z_j}}$$
While each $z_i$ can have any value, the corresponding $y_i\in[0,1]$ and $\sum_i y_i=1$, just like probabilities!
While these $y_i$ are only pseudo-probabilities, this nonlinearity allows one to model probabilities, e.g. of a data point belonging to a certain class.
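A minimal numpy sketch of softmax (the max is subtracted for numerical stability; this does not change the result):
def softmax(z):
    e = np.exp(z - np.max(z))  # shift by max(z) to avoid overflow
    return e / e.sum()

y = softmax(np.array([1.0, 2.0, 3.0]))
print(y, y.sum())  # all values in [0, 1], summing to 1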
In a fully connected neural network each layer is a set of $N$ neurons, each performing a different transformation of the same layer inputs $\bar x = [x_i]$ and producing the output vector $\bar y = [y_j]_{j=1..N}$: $$y_j = f(\bar W_j \cdot \bar x + b_j)$$
Since the output of each layer forms the input of the next layer, one can write for layer $l$: $$x^l_j = f(\bar W^l_j \cdot \bar x^{l-1} + b^l_j)$$ where $\bar x^0$ is the network's input vector.
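As an illustration (a sketch with arbitrary shapes and random weights, not part of the Keras model built below), one fully connected layer is just a matrix-vector product followed by the activation:
rng = np.random.default_rng(0)
x = rng.normal(size=3)        # input vector, D = 3 features
W = rng.normal(size=(4, 3))   # one weight row per neuron, N = 4 neurons
b = rng.normal(size=4)        # one bias per neuron
z = W @ x + b                 # linear part, z_j = W_j . x + b_j
y = np.maximum(z, 0)          # ReLU activation
print(y.shape)                # (4,) -- this output feeds the next layer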
The last part of the puzzle is the measure of network performance, which is used to optimize the network's parameters $W^l_j$ and $b^l_j$. Denoting the network's output for an input $x_i$ as $\hat y_i=\hat y_i(x_i)$ and given the label $y_i$, typical regression losses are:
L1 (MAE): $L = \sum_i |y_i-\hat y_i|$
L2 (MSE): $L = \sum_i (y_i-\hat y_i)^2$
In case of classification we can use cross-entropy, which measures the "distance" from the target distribution: $$L = - \sum_i \sum_c y_{i,c} \log(\hat y_{i,c})$$ Here $\hat y_{i,c}$ is the pseudo-probability of $x_i$ belonging to class $c$, and $y_{i,c}$ uses 1-hot encoding:
$$y_{i,c}= \begin{cases} 1,& \text{if } x_i \text{ belongs to class } c\\ 0, & \text{otherwise} \end{cases}$$
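A small numeric sketch of the cross-entropy with 1-hot labels (made-up numbers for illustration): only the log of the predicted pseudo-probability of the true class contributes for each sample.
y_true = np.array([[0, 1, 0],          # sample 0 belongs to class 1
                   [1, 0, 0]])         # sample 1 belongs to class 0
y_pred = np.array([[0.1, 0.8, 0.1],    # predicted pseudo-probabilities (rows sum to 1)
                   [0.6, 0.3, 0.1]])
L = -np.sum(y_true * np.log(y_pred))
print(L)  # equals -(log(0.8) + log(0.6)) ~ 0.73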
Here we will build a neural network to fit an image.
image_big = imread('https://www.unibe.ch/unibe/portal/content/carousel/showitem940548/UniBE_Coronavirus_612p_eng.jpg')
image_big = image_big[...,0:3]/255
plt.imshow(image_big)
image = image_big[::5, ::5]
image = image.mean(axis=2, keepdims=True)
plt.imshow(image[...,0], cmap='gray')
h, w, c = image.shape
X = np.meshgrid(np.linspace(0, 1, w), np.linspace(0, 1, h))
X = np.stack(X, axis=-1).reshape((-1, 2))
Y = image.reshape((-1, c))
X.shape, Y.shape
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(2,)),
tf.keras.layers.Dense(c, activation='sigmoid'),
])
model.compile(optimizer='adam',
loss='mae',
metrics=['mse'])
model.summary()
hist = model.fit(X, Y, epochs=500, batch_size=2048)
Y_p = model.predict(X)
Y_p = Y_p.reshape((h,w,c))
im = plt.imshow(Y_p[...,0], cmap='gray')
fig, axs = plt.subplots(1, 2, figsize=(10,5))
axs[0].plot(hist.epoch, hist.history['loss'])
axs[0].set_title('loss')
axs[1].plot(hist.epoch, hist.history['mse'])
axs[1].set_title('mse')
plt.show()
Let's try the same with an RGB image:
image = image_big[::5, ::5]
plt.imshow(image)
h, w, c = image.shape
X = np.meshgrid(np.linspace(0, 1, w), np.linspace(0, 1, h))
X = np.stack(X, axis=-1).reshape((-1, 2))
Y = image.reshape((-1, c))
X.shape, Y.shape
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(2,)),
tf.keras.layers.Dense(c, activation='sigmoid'),
])
model.compile(optimizer='adam',
loss='mae',
metrics=['mse'])
model.summary()
But now we will save images during the course of training: at first every 2 epochs, then every 20, every 200 and finally every 1000.
(Remember: a call to model.fit does NOT reinitialize the trainable variables; every call continues from the previous state.)
ims = []
n_ep_tot = 0
for i in range(170):
    if i % 10 == 0:
        print(f'epoch {i}', end='\n')
    ne = (2 if (i<50) else (20 if (i<100) else (200 if (i<150) else 1000)))
    model.fit(X, Y, epochs=ne, batch_size=1*2048, verbose=0)
    Y_p = model.predict(X)
    Y_p = Y_p.reshape((h, w, c))
    ims.append(Y_p)
    n_ep_tot += ne
print(f'total number of epochs trained: {n_ep_tot}')
%%capture
plt.rcParams["animation.html"] = "jshtml" # for matplotlib 2.1 and above, uses JavaScript
fig = plt.figure()
im = plt.imshow(ims[0])
def animate(i):
    img = ims[i]
    im.set_data(img)
    return im
ani = animation.FuncAnimation(fig, animate, frames=len(ims))
ani
While the colors properly represent the target image, our model still has very limited capacity, allowing it to effectively represent only 3 boundaries.
Let's upscale our model:
model = tf.keras.models.Sequential([
tf.keras.layers.Flatten(input_shape=(2,)),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(8, activation='relu'),
tf.keras.layers.Dense(c, activation='sigmoid'),
])
model.compile(optimizer='adam',
loss='mae',
metrics=['mse'])
model.summary()
ims = []
n_ep_tot = 0
for i in range(180):
    if i % 10 == 0:
        print(f'epoch {i}', end='\n')
    ne = (2 if (i<50) else (20 if (i<100) else (200 if (i<150) else 1000)))
    model.fit(X, Y, epochs=ne, batch_size=1*2048, verbose=0)
    Y_p = model.predict(X)
    Y_p = Y_p.reshape((h, w, c))
    ims.append(Y_p)
    n_ep_tot += ne
print(f'total number of epochs trained: {n_ep_tot}')
%%capture
fig = plt.figure()
im = plt.imshow(ims[0])
def animate(i):
    img = ims[i]
    im.set_data(img)
    return im
ani = animation.FuncAnimation(fig, animate, frames=len(ims))
ani