Softmax Regression
Softmax regression implemented from scratch
- The Softmax Function
- The Loss Function
- Optimizing the Loss Function
- Example with Fashion-MNIST
  - Load the Dataset
  - Initializing Model Parameters
  - Defining the Softmax Operation
  - Defining the Model
  - Defining the Loss Function
  - Training
The formula of the softmax function is: $$a_i = \frac{\exp(z_i)}{\sum_{j=1}^C \exp(z_j)}, ~~ \forall i = 1, 2, \dots, C$$
We can then interpret $a_i$ as the probability that sample $\mathbf{x}_k$ belongs to class $i$: $$P(y_k = i \mid \mathbf{x}_k; \mathbf{W}) = a_i$$
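For concreteness, here is a small worked example (values rounded): for scores $\mathbf{z} = (1, 2, 3)$, $$\mathbf{a} = \frac{1}{e^{1} + e^{2} + e^{3}}\left(e^{1}, e^{2}, e^{3}\right) \approx (0.090,\ 0.245,\ 0.665),$$ which sums to $1$, so the outputs form a valid probability distribution.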
The cross-entropy loss for a single data sample is: $$J(\mathbf{W};\mathbf{x}_i, \mathbf{y}_i) = -\sum_{j=1}^C y_{ji}\log(a_{ji})$$
$$= -\sum_{j = 1}^C y_{ji}\log\left(\frac{\exp(\mathbf{w}_j^T\mathbf{x}_i)}{\sum_{k=1}^C \exp(\mathbf{w}_k^T\mathbf{x}_i)}\right)$$ Taking the derivative with respect to $\mathbf{W}$ and writing $\mathbf{e}_i = \mathbf{a}_i - \mathbf{y}_i$, we get: $$\frac{\partial J_i(\mathbf{W})}{\partial \mathbf{W}} = \mathbf{x}_i [e_{1i}, e_{2i}, \dots, e_{Ci}] = \mathbf{x}_i\mathbf{e}_i^T$$
Summing over all $N$ training samples: $$\frac{\partial J(\mathbf{W})}{\partial \mathbf{W}} = \sum_{i=1}^N \mathbf{x}_i\mathbf{e}_i^T = \mathbf{X}\mathbf{E}^T, \quad \text{where } \mathbf{E} = \mathbf{A} - \mathbf{Y}$$
The stochastic gradient descent update rule for $\mathbf{W}$, using a single sample $(\mathbf{x}_i, \mathbf{y}_i)$, is:
$$\mathbf{W} = \mathbf{W} +\eta \mathbf{x}_{i}(\mathbf{y}_i - \mathbf{a}_i)^T$$
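As a minimal sketch of this update rule (illustrative NumPy code; the sgd_step helper and its variable names are not part of the notebook below, which uses TensorFlow):
import numpy as np

def sgd_step(W, x_i, y_i, eta):
    """One update W <- W + eta * x_i (y_i - a_i)^T for a single sample.

    W: (d, C) weights, x_i: (d,) features, y_i: (C,) one-hot label.
    """
    z = W.T @ x_i                        # class scores, shape (C,)
    a = np.exp(z) / np.exp(z).sum()      # softmax probabilities
    return W + eta * np.outer(x_i, y_i - a)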
from d2l import tensorflow as d2l
import tensorflow as tf
from IPython import display
def load_data_fashion_mnist(batch_size, resize=None):
    """Download the Fashion-MNIST dataset and load it into memory."""
    mnist_train, mnist_test = tf.keras.datasets.fashion_mnist.load_data()
    # Divide all pixel values by 255 so that they lie in [0, 1], add a
    # channel dimension at the end, and cast the labels to int32
    process = lambda X, y: (tf.expand_dims(X, axis=3) / 255,
                            tf.cast(y, dtype='int32'))
    # Optionally resize the images (e.g. for models that expect larger inputs)
    resize_fn = lambda X, y: (
        tf.image.resize_with_pad(X, resize, resize) if resize else X, y)
    return (
        tf.data.Dataset.from_tensor_slices(process(*mnist_train)).batch(
            batch_size).shuffle(len(mnist_train[0])).map(resize_fn),
        tf.data.Dataset.from_tensor_slices(process(*mnist_test)).batch(
            batch_size).map(resize_fn))
train_iter, test_iter = load_data_fashion_mnist(32)
for X, y in train_iter:
    print(X.shape, X.dtype, y.shape, y.dtype)
    break
num_inputs = 784
num_outputs = 10
W = tf.Variable(tf.random.normal(shape=(num_inputs, num_outputs),
                                 mean=0, stddev=0.01))
b = tf.Variable(tf.zeros(num_outputs))
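A quick, illustrative sanity check of the parameter shapes: one weight column per output class and one bias entry per class.
print(W.shape, b.shape)  # expected: (784, 10) (10,)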
def softmax(X):
    X_exp = tf.exp(X)
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition  # The broadcasting mechanism is applied here
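Note that exponentiating large scores can overflow. A common remedy, shown in the sketch below (not used in this notebook), subtracts the row-wise maximum before exponentiating, which leaves the result unchanged:
def stable_softmax(X):
    # exp(x - c) / sum(exp(x - c)) == exp(x) / sum(exp(x)), so subtracting
    # the row-wise maximum changes nothing but avoids overflow in tf.exp
    X = X - tf.reduce_max(X, axis=1, keepdims=True)
    X_exp = tf.exp(X)
    return X_exp / tf.reduce_sum(X_exp, axis=1, keepdims=True)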
def net(X):
    return softmax(tf.matmul(tf.reshape(X, (-1, W.shape[0])), W) + b)
def cross_entropy(y_hat, y):
    # Pick out the predicted probability of the true class for each example
    return -tf.math.log(tf.boolean_mask(
        y_hat, tf.one_hot(y, depth=y_hat.shape[-1])))
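As an optional sanity check (the toy values below are illustrative), the custom loss should agree with Keras's built-in sparse categorical cross-entropy, which takes (labels, probabilities) in that order:
y = tf.constant([0, 2])
y_hat = tf.constant([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
print(cross_entropy(y_hat, y).numpy())
print(tf.keras.losses.sparse_categorical_crossentropy(y, y_hat).numpy())
# Both should print approximately [2.3026 0.6931]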
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train the model for one epoch; return (train loss, train accuracy)."""
    # Sum of training loss, sum of correct predictions, no. of examples
    metric = d2l.Accumulator(3)
    for X, y in train_iter:
        with tf.GradientTape() as tape:
            y_hat = net(X)
            l = loss(y_hat, y)
        updater(X.shape[0], tape.gradient(l, updater.params))
        metric.add(tf.reduce_sum(l), d2l.accuracy(y_hat, y), tf.size(y))
    return metric[0] / metric[2], metric[1] / metric[2]
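train_ch3 below also needs a way to measure accuracy. The d2l package ships accuracy and evaluate_accuracy helpers; roughly, they behave like the following sketch (a simplification, not the package's exact code):
def accuracy(y_hat, y):
    """Number of correct predictions in a batch."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = tf.argmax(y_hat, axis=1)  # pick the most likely class
    cmp = tf.cast(y_hat, y.dtype) == y
    return float(tf.reduce_sum(tf.cast(cmp, y.dtype)))

def evaluate_accuracy(net, data_iter):
    """Classification accuracy of the model on a dataset."""
    metric = d2l.Accumulator(2)  # no. of correct predictions, no. of examples
    for X, y in data_iter:
        metric.add(accuracy(net(X), y), tf.size(y))
    return metric[0] / metric[1]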
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train a model (defined in Chapter 3)."""
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            ylim=[0.3, 0.9],
                            legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = d2l.evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc
class Updater():  #@save
    """For updating parameters using minibatch stochastic gradient descent."""
    def __init__(self, params, lr):
        self.params = params
        self.lr = lr

    def __call__(self, batch_size, grads):
        d2l.sgd(self.params, grads, self.lr, batch_size)
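The Updater delegates the actual parameter update to d2l.sgd. For reference, a minimal sketch of such a minibatch SGD step (assuming params is a list of tf.Variable objects) looks like:
def sgd(params, grads, lr, batch_size):
    """Minibatch stochastic gradient descent (sketch of what d2l.sgd does)."""
    for param, grad in zip(params, grads):
        # Step against the gradient, averaged over the minibatch
        param.assign_sub(lr * grad / batch_size)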
updater = Updater([W, b], lr=0.1)
num_epochs = 10
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)
def predict_ch3(net, test_iter, n=6):  #@save
    """Predict labels (defined in Chapter 3)."""
    for X, y in test_iter:
        break
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(tf.argmax(net(X), axis=1))
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(
        tf.reshape(X[0:n], (n, 28, 28)), 1, n, titles=titles[0:n])

predict_ch3(net, test_iter)